## Regression trees: predict state Gini index (states_m.csv) ----------------
library(tree)

d <- read.csv("states_m.csv")

# Fit a regression tree, excluding the two identifier columns.
# (Renamed from `t` to avoid shadowing base::t.)
t1 <- tree(GiniIndex ~ . - StateAbbreviation - Name, data = d)
summary(t1)
# Variables actually used in tree construction:
#   BusinessEthnicOwnershipFractionsBlack, MedianHouseholdIncome,
#   FarmLandArea, MurderNonnegligentManslaughterRate,
#   AverageACTMathematicsScore, BusinessEthnicOwnershipFractionsWhite
# Terminal nodes: 7; residual mean deviance: 8.613e-05 = 0.003704 / 43
# Residuals: min -0.01937, 1Q -0.005842, median -0.0008339,
#            3Q 0.005857, max 0.02357
plot(t1)
text(t1, pretty = 0)

# Linear model on the same six variables the tree selected, for comparison.
lm1 <- lm(
  GiniIndex ~ BusinessEthnicOwnershipFractionsBlack + MedianHouseholdIncome +
    FarmLandArea + MurderNonnegligentManslaughterRate +
    AverageACTMathematicsScore + BusinessEthnicOwnershipFractionsWhite,
  data = d
)

# Overlay tree vs. linear-model fitted values against the observed response.
t1_pred <- predict(t1, d)
plot(d$GiniIndex, t1_pred)
lm_pred <- predict(lm1, d)
points(d$GiniIndex, lm_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

# A deeper tree: allow splits down to nodes of size 3.
tB <- tree(GiniIndex ~ . - StateAbbreviation - Name, data = d, minsize = 3)
summary(tB)
# Variables used: BusinessEthnicOwnershipFractionsBlack, PersonsPerHousehold,
#   BurglaryRate, PerCapitaPersonalIncome, AverageACTReadingScore,
#   HealthInsuranceCoverageRate, MurderNonnegligentManslaughterRate,
#   AverageACTMathematicsScore, ForcibleRapeRate, AverageSATWritingScore
# Terminal nodes: 12; residual mean deviance: 2.168e-05 = 0.0008237 / 38
# Residuals: min -0.009400, median -0.000308, max 0.010910
plot(tB)
text(tB, pretty = 0)

# Print the full split structure.  Root: n = 50, deviance 1.763e-02,
# mean 0.4624; first split on BusinessEthnicOwnershipFractionsBlack < 2.85;
# terminal-node means range from about 0.4268 to 0.5024.
tB

## WHO data: count missing values per column, keep usable columns -----------
d2 <- read.csv("../msc/WHO19.csv")

# Missing-value count per column.  Vectorized colSums(is.na(.)) replaces
# the original per-index helper, which also shadowed base::is().
na_counts <- colSums(is.na(d2))
na_counts
# Counts ranged from 0 up to 111 (e.g. columns 16 and 39 were very sparse).

names(d2)[na_counts < 20]
d2 <- d2[, na_counts < 20]   # keep columns with fewer than 20 NAs

# Regression tree for CO2 emissions from a hand-picked subset of predictors.
t2 <- tree(CO2_emissions ~ GNIperCapita + Population + Urban_pop +
             Internet_users + Health_expenditure_per_person + lifeFemale +
             school_enrolmentM., data = d2)
plot(t2)
text(t2, pretty = 0)
## CO2 tree using all remaining columns -------------------------------------
t3 <- tree(CO2_emissions ~ . - Country, data = d2)
summary(t3)
# Variables actually used: GNIperCapita, Income_per_person, nurses,
#   Births_Skilled., Forest_area
plot(t3)
text(t3, pretty = 0)

## Wine data: 3-class classification tree -----------------------------------
d4 <- read.csv("../msc/wine.csv")
d4$class <- as.factor(d4$class)   # factor response => classification tree

t4 <- tree(class ~ ., data = d4)
summary(t4)
# Variables used: Flavanoids, Color, Malic.acid, Proline, Alcohol
# Terminal nodes: 7; residual mean deviance: 0.08779 = 15.01 / 171
# Misclassification error rate: 0.01685 = 3 / 178

# Print the split structure.  Root: n = 178, deviance 386.6, majority
# class 2, class proportions (0.331, 0.399, 0.270); first split on
# Flavanoids < 1.575, then Color, Malic.acid, Proline, Alcohol.
t4
plot(t4)
text(t4, pretty = 0)

# Confusion matrix on the training data (3 of 178 misclassified).
t4_pred <- predict(t4, d4, type = "class")
table(t4_pred, d4$class)
# t4_pred  1  2  3
#       1 57  0  0
#       2  2 70  0
#       3  0  1 48
# Same error rate as scaled KNN with automatic variable selection
# (Malic.acid had poor dispersion).

# Cross-validate tree size, first by deviance (the default) ...
cv.t4 <- cv.tree(t4)
cv.t4
# size 7 6 5 4 3 2 1
# dev  107.0 102.7 103.1 113.4 157.2 259.5 391.8

# ... then by misclassification count, which is what we prune on below.
cv.t4 <- cv.tree(t4, FUN = prune.misclass)
cv.t4
# size 7 5 4 3 1 ; dev 15 15 16 25 99 ; k -Inf 0 3 13 44
## Prune the wine tree ------------------------------------------------------
plot(cv.t4$size, cv.t4$dev, type = "b")
plot(cv.t4$k, cv.t4$dev, type = "b")

prune.t4 <- prune.misclass(t4, best = 7)
plot(prune.t4)
text(prune.t4, pretty = 0)

prune.t4 <- prune.misclass(t4, best = 5)
plot(prune.t4)
text(prune.t4, pretty = 0)
p_pred <- predict(prune.t4, d4, type = "class")
table(p_pred, d4$class)
# Identical confusion matrix to the unpruned tree: 3 / 178 misclassified.

prune.t4 <- prune.misclass(t4, best = 4)
plot(prune.t4)
text(prune.t4, pretty = 0)
p_pred <- predict(prune.t4, d4, type = "class")
table(p_pred, d4$class)
# 4-leaf tree: 6 / 178 misclassified
#   (1: 58/4/0, 2: 1/66/0, 3: 0/1/48)

## Join Trump vote with states ----------------------------------------------
# Plan: there are good ways within R to do joins based on a key, but here
# both files are in the same row order, so after dropping DC from each we
# can just copy the column across by position.
d <- read.csv("states_m.csv")
d <- d[d$StateAbbreviation != "DC", ]
dt <- read.csv("trump.csv")
dt <- dt[dt$StateAbbreviation != "DC", ]

# The original only printed this comparison; enforce it so a misaligned
# positional "join" fails loudly instead of silently corrupting the data.
stopifnot(dt$StateAbbreviation == d$StateAbbreviation)
d$trump <- as.factor(dt$Trump)

t5 <- tree(trump ~ . - StateAbbreviation - Name, data = d)
plot(t5)
text(t5, pretty = 0)

t6 <- tree(trump ~ . - StateAbbreviation - Name, data = d, minsize = 3)
plot(t6)
text(t6, pretty = 0)
t6

t6_pred <- predict(t6, d, type = "class")
table(t6_pred, d$trump)
# Perfect on the training data: 20 FALSE / 30 TRUE, no errors.

cv.t6 <- cv.tree(t6, FUN = prune.misclass)
cv.t6
prune.t6 <- prune.misclass(t6, best = 3)
plot(prune.t6)
text(prune.t6, pretty = 0)
prune.t6 <- prune.misclass(t6, best = 4)
plot(prune.t6)
text(prune.t6, pretty = 0)
# From ?prune.misclass: "If there is no tree in the sequence of the
# requested size, the next largest is returned."

## WHO data again: drop columns with too many NAs ---------------------------
d2 <- read.csv("../msc/WHO19.csv")
na_counts <- colSums(is.na(d2))   # replaces the is()-shadowing helper
na_counts
names(d2)[na_counts < 20]
d2 <- d2[, na_counts < 20]
# Columns retained after the NA filter (by original position), e.g.:
#  [4] fertility.        [5] GNIperCapita      [6] school_enrolmentF.
#  [7] school_enrolmentM. [10] urban.          [11] median_age
# [14] Births_Skilled.   [19] health.          [22] cancer
# [23] cardiovascular    [24] tuberculosis     [25] lifeFemale
# [26] lifeMale          [27] Infant_mortalityFemale
# [28] Infant_mortalityMale [29] alcohol       [30] water_acces.
# [33] CO2_emissions     [34] Cell_per  [36] Health_expenditure_per_person
# [38] Income_per_person [40] Population_growth [45] Urban_pop.

# Numeric subset used for the population-growth models below.
d3 <- d2[, c(4:7, 10, 11, 14, 19, 22:30, 33, 34, 36, 38, 40, 45)]

# Regression tree for population growth.
T1 <- tree(Population_growth ~ ., data = d3)
T1
# Root: n = 137, deviance 158.8, mean 1.448; first split median_age < 22.5.
# Other splitters: Cell_per, Births_Skilled., health., alcohol, urban.,
# cardiovascular.  Terminal-node means range from -0.4071 to 3.462.
plot(T1)
text(T1, pretty = 0)

# Linear model on the variables the tree selected.
LM1 <- lm(Population_growth ~ median_age + Cell_per + Births_Skilled. +
            health. + alcohol + urban. + cardiovascular, data = d3)

# The transcript showed the coefficient table but never called summary();
# it is needed to produce the estimates, t values, and significance codes.
summary(LM1)
# (Intercept)      5.4677  (p < 2e-16)
# median_age      -0.1068  (p = 6.1e-14)
# Cell_per         0.0078  (p = 0.0098)
# Births_Skilled. -0.0142  (p = 1.3e-05)
# health.         -0.0727  (p = 0.0114)
# alcohol         -0.0200  (p = 0.331, not significant)
# urban.           0.0111  (p = 0.0012)
# cardiovascular  -0.0012  (p = 0.0109)
# Residual standard error 0.6656 on 152 df (33 rows dropped for missingness);
# multiple R-squared 0.6546, adjusted R-squared 0.6387.
plot(LM1)   # standard lm diagnostic plots

# Compare tree vs. linear fitted values against the observed response.
LM1_pred <- predict(LM1, d3)
T1_pred <- predict(T1, d3)
plot(d3$Population_growth, T1_pred)
points(d3$Population_growth, LM1_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

## Bagging: random forest with mtry = all 22 predictors ---------------------
library(randomForest)
str(d3)   # 'data.frame': 193 obs. of 23 variables

bag1 <- randomForest(Population_growth ~ ., data = d3, mtry = 22,
                     importance = TRUE, na.action = na.omit)
bag1
# 500 trees, 22 variables tried at each split.
# Mean of squared residuals: 0.3955; % var explained: 65.88
# NOTE(review): removed a stray, syntax-breaking fragment pasted from the
# ISLR Boston example ("medv~.,data=Boston,subset=train,mtry=13,...").

bag1_pred <- predict(bag1, d3)
plot(d3$Population_growth, bag1_pred)
points(d3$Population_growth, LM1_pred, pch = 17, col = "red")
abline(a = 0, b = 1)

# In-sample RMSEs — bagging looks best, but these are all fit to the
# same data, so this is not an honest out-of-sample comparison.
sqrt(mean((d3$Population_growth - LM1_pred)^2, na.rm = TRUE))    # 0.649
sqrt(mean((d3$Population_growth - bag1_pred)^2, na.rm = TRUE))   # 0.257
sqrt(mean((d3$Population_growth - T1_pred)^2, na.rm = TRUE))     # 0.556
# Baseline: RMSE of predicting the overall mean.
sqrt(mean((d3$Population_growth -
             mean(d3$Population_growth, na.rm = TRUE))^2,
          na.rm = TRUE))                                         # 1.078

importance(bag1)
# median_age dominates (%IncMSE 46.4, IncNodePurity 88.6), followed by
# cardiovascular (15.0) and Births_Skilled. (10.9).
## Random forests with fewer variables per split ----------------------------
# Default mtry for regression is p/3 (= 7 here).
rf1 <- randomForest(Population_growth ~ ., data = d3,
                    importance = TRUE, na.action = na.omit)
rf1
# 500 trees, mtry = 7: mean squared residuals 0.4500, % var explained 61.17

# Intermediate mtry between the default (7) and full bagging (22).
rf1 <- randomForest(Population_growth ~ ., data = d3, mtry = 15,
                    importance = TRUE, na.action = na.omit)
rf1
# 500 trees, mtry = 15: mean squared residuals 0.4136, % var explained 64.31

importance(rf1)
# median_age again dominates (%IncMSE 35.4, IncNodePurity 67.5);
# next: cardiovascular 13.8, Births_Skilled. 10.8, Infant_mortalityMale 8.0,
# CO2_emissions 7.8, alcohol 7.3.
## Boosting with gbm --------------------------------------------------------
library(gbm)   # transcript used gbm 2.1.5

# Drop rows with a missing response before fitting.
d4 <- d3[!is.na(d3$Population_growth), ]

# Fixed: both original gbm() calls contained a doubled comma
# ("data=d4,,distribution=..."), passing a spurious empty argument.
boost <- gbm(Population_growth ~ ., data = d4, distribution = "gaussian",
             n.trees = 5000, interaction.depth = 4)
summary(boost)   # relative influence of each predictor

boost_pred <- predict(boost, d4, n.trees = 5000)
plot(d4$Population_growth, boost_pred)

## Hold-out evaluation: train on 125 rows, predict the rest -----------------
set.seed(1)   # make the train/test split reproducible
train <- sample(seq_len(nrow(d4)), 125)
boost2 <- gbm(Population_growth ~ ., data = d4[train, ],
              distribution = "gaussian", n.trees = 5000,
              interaction.depth = 4)
boost2_pred <- predict(boost2, newdata = d4[-train, ], n.trees = 5000)
plot(d4$Population_growth[-train], boost2_pred)