library(class) D=read.csv("wilt_training.csv") > str(D) 'data.frame': 4339 obs. of 6 variables: $ class: Factor w/ 2 levels "n","w": 2 2 2 2 2 2 2 2 2 2 ... $ GLCM : num 120 125 135 128 135 ... $ Green: num 206 203 199 178 197 ... $ Red : num 119.4 115.3 116.9 92.4 112.7 ... $ NIR : num 417 354 478 278 533 ... $ SD : num 20.7 16.7 22.5 15 17.6 ... > D2=read.csv("wilt_testing.csv") > str(D2) 'data.frame': 500 obs. of 6 variables: $ class: Factor w/ 2 levels "n","w": 1 1 1 1 2 1 2 2 1 1 ... $ GLCM : num 110 130 131 141 121 ... $ Green: num 184 213 185 181 218 ... $ Red : num 83 96.9 85.5 81.5 112 ... $ NIR : num 252 482 420 348 427 ... $ SD : num 16.1 21.2 13.3 18.2 19.1 ... > table(D$class) n w 4265 74 > table(D2$class) n w 313 187 Note: the fraction of class w in the training set (74/4339, about 1.7%) is far lower than in the testing set (187/500, about 37%). predict=knn(D[,2:6],D2[,2:6],D[,1],k=1) table(predict,D2[,1]) > predict=knn(D[,2:6],D2[,2:6],D[,1],k=1) > table(predict,D2[,1]) predict n w n 306 89 w 7 98 > (306+98)/500 [1] 0.808 > 98/187 [1] 0.5240642 > predict=knn(D[,2:6],D2[,2:6],D[,1],k=3) > table(predict,D2[,1]) predict n w n 309 114 w 4 73 > (309+73)/500 [1] 0.764 > 73/187 [1] 0.3903743 > predict=knn(D[,2:6],D2[,2:6],D[,1],k=5) > table(predict,D2[,1]) predict n w n 312 136 w 1 51 > (312+51)/500 [1] 0.726 > 51/187 [1] 0.2727273 As k increases, the rate of false positives goes down and the rate of false negatives goes up. We really should use scaled variables (the typical distance between points along a coordinate is basically that coordinate's SD). Stdev=apply(D[,2:6], 2, sd) Avg=colMeans(D[,2:6]) Dscaled=scale(D[,2:6]) D2scaled=scale(D2[,2:6],center=Avg,scale=Stdev) GLCM Green Red NIR SD 126.83130 233.90691 117.29244 534.10468 24.92459 > Stdev GLCM Green Red NIR SD 13.73584 60.75769 60.71116 154.49550 11.00830 or library(matrixStats) colSds(as.matrix(D[,2:6])) so the unscaled distance is mostly determined by NIR, which has by far the largest SD. Note: D2scaled mean/stdev are not 0/1, as we used the training mean/stdev > colMeans(D2scaled) GLCM Green Red NIR SD 0.01708518 -0.39730517 -0.15735531 -0.52020811 -0.38909724 > apply(D2scaled, 2, sd) GLCM Green Red NIR SD 0.7766212 
1.2949433 1.1822050 1.0110218 0.6138386 Report the accuracy for $k=1,3,5$. > predict=knn(Dscaled,D2scaled,D[,1],k=1) > table(predict,D2[,1]) predict n w n 306 128 w 7 59 > (306+59)/500 [1] 0.73 > 59/187 [1] 0.315508 > predict=knn(Dscaled,D2scaled,D[,1],k=3) > table(predict,D2[,1]) predict n w n 308 147 w 5 40 > (308+40)/500 [1] 0.696 > 40/187 [1] 0.2139037 > predict=knn(Dscaled,D2scaled,D[,1],k=5) > table(predict,D2[,1]) predict n w n 310 160 w 3 27 > (310+27)/500 [1] 0.674 > 27/187 [1] 0.144385 Scaling seems worse... presumably we do not want equal weighting of the variables. Additional problem: the training set is vastly more F (class n, not wilted) than T (class w, wilted), so it is hard to get a majority of T among the nearest neighbours; we need a bias that pulls some measured-wilted observations currently classified F into being classified T. Color=c(rgb(.1,.1,.1,.05), rgb(1,0,0,1)) Shape=c(3,17) pairs(D[,2:6],pch=Shape[as.numeric(D$class)],col=Color[as.numeric(D$class)]) pairs(log(D[,2:6]),pch=Shape[as.numeric(D$class)],col=Color[as.numeric(D$class)]) plot(D[,3],D[,4],pch=Shape[as.numeric(D$class)],col=Color[as.numeric(D$class)]) lm(formula = log(Red) ~ log(Green), data = D, subset = D$class == "n") (Intercept) -4.117926 0.035397 -116.3 <2e-16 *** log(Green) 1.621836 0.006508 249.2 <2e-16 *** Residual standard error: 0.07949 on 4263 degrees of freedom Multiple R-squared: 0.9358, Adjusted R-squared: 0.9358 > diff=log(D$Red)-(1.622*log(D$Green)-4.118) > d=D[,c(5,6)] > d$diff=diff > d2=D2[,c(5,6)] > diff2=log(D2$Red)-(1.622*log(D2$Green)-4.118) > d2$diff=diff2 > stdev=apply(d, 2, sd) > avg=colMeans(d) > dscaled=as.data.frame(scale(d)) > d2scaled=as.data.frame(scale(d2,center=avg,scale=stdev)) > predict=knn(dscaled,d2scaled,D[,1],k=1) > table(predict,D2[,1]) predict n w n 303 106 w 10 81 > (303+81)/500 [1] 0.768 > 81/187 [1] 0.4331551 > predict=knn(dscaled,d2scaled,D[,1],k=3) > table(predict,D2[,1]) predict n w n 307 111 w 6 76 > (307+76)/500 [1] 0.766 > 76/187 [1] 0.4064171 > predict=knn(dscaled,d2scaled,D[,1],k=5) > table(predict,D2[,1]) predict n w n 308 113 w 5 74 > (308+74)/500 [1] 0.764 > 74/187 [1] 0.3957219 > 
predict=knn(dscaled,d2scaled,D[,1],prob=T,k=5) > predict2= (predict=="w") | (predict=="n" & attributes(predict)$prob <=.6) > table(predict2,D2[,1]) predict2 n w FALSE 303 94 TRUE 10 93 > (93+303)/500 [1] 0.792 > (93/187) [1] 0.4973262 > predict2= (predict=="w") | (predict=="n" & attributes(predict)$prob <=.8) > table(predict2,D2[,1]) predict2 n w FALSE 276 62 TRUE 37 125 > (276+125)/500 [1] 0.802 > 125/187 [1] 0.6684492 ---- library(MASS) dscaled$class=D[,1] d2scaled$class=D2[,1] lda1=lda(class~diff+SD+NIR,data=dscaled) > lda1 Call: lda(class ~ diff + SD + NIR, data = dscaled) Prior probabilities of groups: n w 0.98294538 0.01705462 Group means: diff SD NIR n -0.05236526 0.007236775 0.01307415 w 3.01807860 -0.417092500 -0.75353054 Coefficients of linear discriminants: LD1 diff 1.1865514 SD -0.1661454 NIR 0.2888952 lda.predict=predict(lda1,d2scaled) > table(lda.predict$class,D2[,1]) n w n 303 95 w 10 92 > (303+92)/500 [1] 0.79 > (92)/187 [1] 0.4919786 > hist(lda.predict$x[d2scaled$class=="n"],breaks=seq(-8,6,.5),col=rgb(0,0,1,.25)) > hist(lda.predict$x[d2scaled$class=="w"],breaks=seq(-8,6,.5),col=rgb(1,0,0,.25),add=T) table(lda.predict$x>2.5,D2[,1]) n w FALSE 298 71 TRUE 15 116 > (298+116)/500 [1] 0.828 > 116/187 [1] 0.6203209 > table(lda.predict$x>2.25,D2[,1]) n w FALSE 293 53 TRUE 20 134 > (293+134)/500 [1] 0.854 > 134/187 [1] 0.7165775 > table(lda.predict$x>2.0,D2[,1]) n w FALSE 281 29 TRUE 32 158 > (281+158)/500 [1] 0.878 > 158/187 [1] 0.8449198 > table(lda.predict$x>1.75,D2[,1]) n w FALSE 270 16 TRUE 43 171 > (270+171)/500 [1] 0.882 > 171/187 [1] 0.9144385 > table(lda.predict$x>1.5,D2[,1]) n w FALSE 251 9 TRUE 62 178 > (251+178)/500 [1] 0.858 > 178/187 [1] 0.9518717 qda1=qda(class~diff+SD+NIR,data=dscaled) > qda1 Call: qda(class ~ diff + SD + NIR, data = dscaled) Prior probabilities of groups: n w 0.98294538 0.01705462 Group means: diff SD NIR n -0.05236526 0.007236775 0.01307415 w 3.01807860 -0.417092500 -0.75353054 qda.predict=predict(qda1,d2scaled) 
table(qda.predict$class,D2[,1]) n w n 297 88 w 16 99 > (297+99)/500 [1] 0.792 > 99/187 [1] 0.5294118 > table(qda.predict$posterior[,1]-qda.predict$posterior[,2]<.9,D2[,1]) n w FALSE 271 32 TRUE 42 155 > (271+155)/500 [1] 0.852 > 155/187 [1] 0.828877