## Regression trees: predict state Gini index (states_m.csv) ----------------
library(tree)

d <- read.csv("states_m.csv")

# Fit a regression tree, excluding the two identifier columns.
# (Renamed from `t` to avoid shadowing base::t.)
t1 <- tree(GiniIndex ~ . - StateAbbreviation - Name, data = d)
summary(t1)
# Variables actually used in tree construction:
#   BusinessEthnicOwnershipFractionsBlack, MedianHouseholdIncome,
#   FarmLandArea, MurderNonnegligentManslaughterRate,
#   AverageACTMathematicsScore, BusinessEthnicOwnershipFractionsWhite
# Terminal nodes: 7; residual mean deviance: 8.613e-05 = 0.003704 / 43
# Residuals: min -0.01937, 1Q -0.005842, median -0.0008339,
#            3Q 0.005857, max 0.02357
plot(t1)
text(t1, pretty = 0)

# Linear model on the same six variables the tree selected, for comparison.
lm1 <- lm(
  GiniIndex ~ BusinessEthnicOwnershipFractionsBlack + MedianHouseholdIncome +
    FarmLandArea + MurderNonnegligentManslaughterRate +
    AverageACTMathematicsScore + BusinessEthnicOwnershipFractionsWhite,
  data = d
)

# Overlay tree vs. linear-model fitted values against the observed response.
t1_pred <- predict(t1, d)
plot(d$GiniIndex, t1_pred)
lm_pred <- predict(lm1, d)
points(d$GiniIndex, lm_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

# A deeper tree: allow splits down to nodes of size 3.
tB <- tree(GiniIndex ~ . - StateAbbreviation - Name, data = d, minsize = 3)
summary(tB)
# Variables used: BusinessEthnicOwnershipFractionsBlack, PersonsPerHousehold,
#   BurglaryRate, PerCapitaPersonalIncome, AverageACTReadingScore,
#   HealthInsuranceCoverageRate, MurderNonnegligentManslaughterRate,
#   AverageACTMathematicsScore, ForcibleRapeRate, AverageSATWritingScore
# Terminal nodes: 12; residual mean deviance: 2.168e-05 = 0.0008237 / 38
# Residuals: min -0.009400, median -0.000308, max 0.010910
plot(tB)
text(tB, pretty = 0)

# Print the full split structure.  Root: n = 50, deviance 1.763e-02,
# mean 0.4624; first split on BusinessEthnicOwnershipFractionsBlack < 2.85;
# terminal-node means range from about 0.4268 to 0.5024.
tB

## WHO data: count missing values per column, keep usable columns -----------
d2 <- read.csv("../msc/WHO19.csv")

# Missing-value count per column.  Vectorized colSums(is.na(.)) replaces
# the original per-index helper, which also shadowed base::is().
na_counts <- colSums(is.na(d2))
na_counts
# Counts ranged from 0 up to 111 (e.g. columns 16 and 39 were very sparse).

names(d2)[na_counts < 20]
d2 <- d2[, na_counts < 20]   # keep columns with fewer than 20 NAs

# Regression tree for CO2 emissions from a hand-picked subset of predictors.
t2 <- tree(CO2_emissions ~ GNIperCapita + Population + Urban_pop +
             Internet_users + Health_expenditure_per_person + lifeFemale +
             school_enrolmentM., data = d2)
plot(t2)
text(t2, pretty = 0)
## CO2 tree using all remaining columns -------------------------------------
t3 <- tree(CO2_emissions ~ . - Country, data = d2)
summary(t3)
# Variables actually used: GNIperCapita, Income_per_person, nurses,
#   Births_Skilled., Forest_area
plot(t3)
text(t3, pretty = 0)

## Wine data: 3-class classification tree -----------------------------------
d4 <- read.csv("../msc/wine.csv")
d4$class <- as.factor(d4$class)   # factor response => classification tree

t4 <- tree(class ~ ., data = d4)
summary(t4)
# Variables used: Flavanoids, Color, Malic.acid, Proline, Alcohol
# Terminal nodes: 7; residual mean deviance: 0.08779 = 15.01 / 171
# Misclassification error rate: 0.01685 = 3 / 178

# Print the split structure.  Root: n = 178, deviance 386.6, majority
# class 2, class proportions (0.331, 0.399, 0.270); first split on
# Flavanoids < 1.575, then Color, Malic.acid, Proline, Alcohol.
t4
plot(t4)
text(t4, pretty = 0)

# Confusion matrix on the training data (3 of 178 misclassified).
t4_pred <- predict(t4, d4, type = "class")
table(t4_pred, d4$class)
# t4_pred  1  2  3
#       1 57  0  0
#       2  2 70  0
#       3  0  1 48
# Same error rate as scaled KNN with automatic variable selection
# (Malic.acid had poor dispersion).

# Cross-validate tree size, first by deviance (the default) ...
cv.t4 <- cv.tree(t4)
cv.t4
# size 7 6 5 4 3 2 1
# dev  107.0 102.7 103.1 113.4 157.2 259.5 391.8

# ... then by misclassification count, which is what we prune on below.
cv.t4 <- cv.tree(t4, FUN = prune.misclass)
cv.t4
# size 7 5 4 3 1 ; dev 15 15 16 25 99 ; k -Inf 0 3 13 44
## Prune the wine tree ------------------------------------------------------
plot(cv.t4$size, cv.t4$dev, type = "b")
plot(cv.t4$k, cv.t4$dev, type = "b")

prune.t4 <- prune.misclass(t4, best = 7)
plot(prune.t4)
text(prune.t4, pretty = 0)

prune.t4 <- prune.misclass(t4, best = 5)
plot(prune.t4)
text(prune.t4, pretty = 0)
p_pred <- predict(prune.t4, d4, type = "class")
table(p_pred, d4$class)
# Identical confusion matrix to the unpruned tree: 3 / 178 misclassified.

prune.t4 <- prune.misclass(t4, best = 4)
plot(prune.t4)
text(prune.t4, pretty = 0)
p_pred <- predict(prune.t4, d4, type = "class")
table(p_pred, d4$class)
# 4-leaf tree: 6 / 178 misclassified
#   (1: 58/4/0, 2: 1/66/0, 3: 0/1/48)

## Join Trump vote with states ----------------------------------------------
# Plan: there are good ways within R to do joins based on a key, but here
# both files are in the same row order, so after dropping DC from each we
# can just copy the column across by position.
d <- read.csv("states_m.csv")
d <- d[d$StateAbbreviation != "DC", ]
dt <- read.csv("trump.csv")
dt <- dt[dt$StateAbbreviation != "DC", ]

# The original only printed this comparison; enforce it so a misaligned
# positional "join" fails loudly instead of silently corrupting the data.
stopifnot(dt$StateAbbreviation == d$StateAbbreviation)
d$trump <- as.factor(dt$Trump)

t5 <- tree(trump ~ . - StateAbbreviation - Name, data = d)
plot(t5)
text(t5, pretty = 0)

t6 <- tree(trump ~ . - StateAbbreviation - Name, data = d, minsize = 3)
plot(t6)
text(t6, pretty = 0)
t6

t6_pred <- predict(t6, d, type = "class")
table(t6_pred, d$trump)
# Perfect on the training data: 20 FALSE / 30 TRUE, no errors.

cv.t6 <- cv.tree(t6, FUN = prune.misclass)
cv.t6
prune.t6 <- prune.misclass(t6, best = 3)
plot(prune.t6)
text(prune.t6, pretty = 0)
prune.t6 <- prune.misclass(t6, best = 4)
plot(prune.t6)
text(prune.t6, pretty = 0)
# From ?prune.misclass: "If there is no tree in the sequence of the
# requested size, the next largest is returned."

## WHO data again: drop columns with too many NAs ---------------------------
d2 <- read.csv("../msc/WHO19.csv")
na_counts <- colSums(is.na(d2))   # replaces the is()-shadowing helper
na_counts
names(d2)[na_counts < 20]
d2 <- d2[, na_counts < 20]
# Columns retained after the NA filter (by original position), e.g.:
#  [4] fertility.        [5] GNIperCapita      [6] school_enrolmentF.
#  [7] school_enrolmentM. [10] urban.          [11] median_age
# [14] Births_Skilled.   [19] health.          [22] cancer
# [23] cardiovascular    [24] tuberculosis     [25] lifeFemale
# [26] lifeMale          [27] Infant_mortalityFemale
# [28] Infant_mortalityMale [29] alcohol       [30] water_acces.
# [33] CO2_emissions     [34] Cell_per  [36] Health_expenditure_per_person
# [38] Income_per_person [40] Population_growth [45] Urban_pop.

# Numeric subset used for the population-growth models below.
d3 <- d2[, c(4:7, 10, 11, 14, 19, 22:30, 33, 34, 36, 38, 40, 45)]

# Regression tree for population growth.
T1 <- tree(Population_growth ~ ., data = d3)
T1
# Root: n = 137, deviance 158.8, mean 1.448; first split median_age < 22.5.
# Other splitters: Cell_per, Births_Skilled., health., alcohol, urban.,
# cardiovascular.  Terminal-node means range from -0.4071 to 3.462.
plot(T1)
text(T1, pretty = 0)

# Linear model on the variables the tree selected.
LM1 <- lm(Population_growth ~ median_age + Cell_per + Births_Skilled. +
            health. + alcohol + urban. + cardiovascular, data = d3)

# The transcript showed the coefficient table but never called summary();
# it is needed to produce the estimates, t values, and significance codes.
summary(LM1)
# (Intercept)      5.4677  (p < 2e-16)
# median_age      -0.1068  (p = 6.1e-14)
# Cell_per         0.0078  (p = 0.0098)
# Births_Skilled. -0.0142  (p = 1.3e-05)
# health.         -0.0727  (p = 0.0114)
# alcohol         -0.0200  (p = 0.331, not significant)
# urban.           0.0111  (p = 0.0012)
# cardiovascular  -0.0012  (p = 0.0109)
# Residual standard error 0.6656 on 152 df (33 rows dropped for missingness);
# multiple R-squared 0.6546, adjusted R-squared 0.6387.
plot(LM1)   # standard lm diagnostic plots

# Compare tree vs. linear fitted values against the observed response.
LM1_pred <- predict(LM1, d3)
T1_pred <- predict(T1, d3)
plot(d3$Population_growth, T1_pred)
points(d3$Population_growth, LM1_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

## Bagging: random forest with mtry = all 22 predictors ---------------------
library(randomForest)
str(d3)   # 'data.frame': 193 obs. of 23 variables

bag1 <- randomForest(Population_growth ~ ., data = d3, mtry = 22,
                     importance = TRUE, na.action = na.omit)
bag1
# 500 trees, 22 variables tried at each split.
# Mean of squared residuals: 0.3955; % var explained: 65.88
# NOTE(review): removed a stray, syntax-breaking fragment pasted from the
# ISLR Boston example ("medv~.,data=Boston,subset=train,mtry=13,...").

bag1_pred <- predict(bag1, d3)
plot(d3$Population_growth, bag1_pred)
points(d3$Population_growth, LM1_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

# In-sample RMSEs — bagging looks best, but these are all fit to the
# same data, so this is not an honest out-of-sample comparison.
sqrt(mean((d3$Population_growth - LM1_pred)^2, na.rm = TRUE))    # 0.649
sqrt(mean((d3$Population_growth - bag1_pred)^2, na.rm = TRUE))   # 0.257
sqrt(mean((d3$Population_growth - T1_pred)^2, na.rm = TRUE))     # 0.556
# Baseline: RMSE of predicting the overall mean.
sqrt(mean((d3$Population_growth -
             mean(d3$Population_growth, na.rm = TRUE))^2,
          na.rm = TRUE))                                         # 1.078

importance(bag1)
# median_age dominates (%IncMSE 46.4, IncNodePurity 88.6), followed by
# cardiovascular (15.0) and Births_Skilled. (10.9).
## Random forests with fewer variables per split ----------------------------
# Default mtry for regression is p/3 (= 7 here).
rf1 <- randomForest(Population_growth ~ ., data = d3,
                    importance = TRUE, na.action = na.omit)
rf1
# 500 trees, mtry = 7: mean squared residuals 0.4500, % var explained 61.17

# Intermediate mtry between the default (7) and full bagging (22).
rf1 <- randomForest(Population_growth ~ ., data = d3, mtry = 15,
                    importance = TRUE, na.action = na.omit)
rf1
# 500 trees, mtry = 15: mean squared residuals 0.4136, % var explained 64.31

importance(rf1)
# median_age again dominates (%IncMSE 35.4, IncNodePurity 67.5);
# next: cardiovascular 13.8, Births_Skilled. 10.8, Infant_mortalityMale 8.0,
# CO2_emissions 7.8, alcohol 7.3.
## Boosting with gbm --------------------------------------------------------
library(gbm)   # transcript used gbm 2.1.5

# Drop rows with a missing response before fitting.
d4 <- d3[!is.na(d3$Population_growth), ]

# Fixed: both original gbm() calls contained a doubled comma
# ("data=d4,,distribution=..."), passing a spurious empty argument.
boost <- gbm(Population_growth ~ ., data = d4, distribution = "gaussian",
             n.trees = 5000, interaction.depth = 4)
summary(boost)   # relative influence of each predictor

boost_pred <- predict(boost, d4, n.trees = 5000)
plot(d4$Population_growth, boost_pred)

## Hold-out evaluation: train on 125 rows, predict the rest -----------------
set.seed(1)   # make the train/test split reproducible
train <- sample(seq_len(nrow(d4)), 125)
boost2 <- gbm(Population_growth ~ ., data = d4[train, ],
              distribution = "gaussian", n.trees = 5000,
              interaction.depth = 4)
boost2_pred <- predict(boost2, newdata = d4[-train, ], n.trees = 5000)
plot(d4$Population_growth[-train], boost2_pred)