# Math scratch (Mathematica input, kept verbatim; not R code) ----
# Each pair of expressions below expands to the same polynomial: the sum of
# squared deviations from the mean of n values equals the sum of squared
# pairwise differences divided by n (shown here for n = 3 and n = 4).
#
#   (a-(a+b+c)/3)^2+ (b-(a+b+c)/3)^2+ (c-(a+b+c)/3)^2
#   Expand[%]
#   ((a-b)^2+(b-c)^2+(c-a)^2)/3
#   Expand[%]
#
#   (a-(a+b+c+d)/4)^2+ (b-(a+b+c+d)/4)^2+ (c-(a+b+c+d)/4)^2 +(d-(a+b+c+d)/4)^2
#   Expand[%]
#   ((a-b)^2+(a-c)^2+(a-d)^2+(b-c)^2+(b-d)^2+(c-d)^2)/4
#   Expand[%]

# K-means on simulated 2-D data ----
# 50 standard-normal points in two columns; the first 25 are shifted by
# (+3, +2) and labelled class 1, the remaining 25 are labelled class 2.
d = as.data.frame(matrix(rnorm(50 * 2), ncol = 2))
names(d) = c("x", "y")
plot(y ~ x, data = d)
d$x[1:25] = d$x[1:25] + 3
d$y[1:25] = d$y[1:25] + 2
d$class = 2
d$class[1:25] = 1

# Plot palettes indexed by class / cluster number (1..3).
cpick = c("red", "blue", "green")
spick = c(16, 17, 15)
plot(y ~ x, data = d, pch = spick[d$class], col = cpick[d$class])

# K = 2: symbol shows the true class, colour shows the k-means cluster.
km.out = kmeans(d[, 1:2], 2, nstart = 20)
km.out$cluster
plot(y ~ x, data = d, pch = spick[d$class], col = cpick[km.out$cluster])
table(km.out$cluster, d$class)

# K = 3 on the same data.
km.out = kmeans(d[, 1:2], 3, nstart = 20)
km.out$cluster
plot(y ~ x, data = d, pch = spick[d$class], col = cpick[km.out$cluster])
table(km.out$cluster, d$class)

# Console transcript from one run (no seed is set, so a re-run will differ):
# > km.out
# K-means clustering with 3 clusters of sizes 23, 15, 12
#
# Cluster means:
#            x            y
# 1 -0.2150022 -0.006394591
# 2  3.0333951  2.562228516
# 3  2.2369879  0.686216586
#
# Clustering vector:
#  [1] 3 2 3 2 2 2 3 3 2 2 2 2 1 2 2 3 3 2 2 2 3 3 3 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1
# [39] 1 1 3 1 1 1 1 1 1 1 1 3
#
# Within cluster sum of squares by cluster:
# [1] 36.08941 19.97875 15.82132
#  (between_SS / total_SS = 70.2 %)
#
# Available components:
# [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
# [6] "betweenss" "size" "iter" "ifault"
#
# > setwd("~/dataScience/data/msc")
# > getwd()
# [1] "/home/tkirkman/dataScience/data/msc"

# Hierarchical clustering on the wine data ----
# wine.csv is read relative to the current working directory (the session
# transcript above had it set to ~/dataScience/data/msc).
D = read.csv("wine.csv")

# F value (row 1, column 4 of the aov summary table) for column i of D
# modelled against D$class.
# NOTE(review): if D$class is numeric, aov() fits a 1-df linear trend rather
# than a one-way ANOVA across classes; factor(D$class) may be what is
# intended -- confirm before changing, since it alters the feature ranking.
nova = function(i) {
  summary(aov(D[, i] ~ D$class))[[1]][1, 4]
}

# Score columns 2..14 and order them by decreasing F value; the +1 converts
# positions within 2:14 back to column indices of D.
h = vapply(2:14, nova, numeric(1))
r = order(h, decreasing = TRUE) + 1
names(D)[r[1:5]]

# Standardise the five top-ranked columns and cluster with three linkages.
D1 = scale(D[, r[1:5]])
hc.complete = hclust(dist(D1), method = "complete")
hc.average = hclust(dist(D1), method = "average")
hc.single = hclust(dist(D1), method = "single")
plot(hc.complete, main = "complete", cex = .9)
plot(hc.average, main = "average", cex = .9)
plot(hc.single, main = "single", cex = .9)

# Cut the trees and cross-tabulate cluster membership against the class.
hc.c3 = cutree(hc.complete, 3)
table(hc.c3, D$class)
hc.c5 = cutree(hc.complete, 5)
table(hc.c5, D$class)
hc.a3 = cutree(hc.average, 3)
# Tabulate the average-linkage cut just computed (the original repeated
# hc.c3 here, so the hc.a3 result was never displayed).
table(hc.a3, D$class)
hc.a5 = cutree(hc.average, 5)
# Wine data: compare cluster assignments with the recorded class ----
# Uses D, D1, r, hc.a5, hc.c3, spick and cpick created earlier in the file.
table(hc.a5, D$class)

# Scatter of the two top-ranked columns: symbol = class; colour = class,
# then complete-linkage 3-cut, then 3-means cluster.
plot(D[, r[1:2]], pch = spick[D$class], col = cpick[D$class])
plot(D[, r[1:2]], pch = spick[D$class], col = cpick[hc.c3])
km.out = kmeans(D1, 3, nstart = 20)
table(km.out$cluster, D$class)
plot(D[, r[1:2]], pch = spick[D$class], col = cpick[km.out$cluster])

# Simulated data d: hierarchical clustering with three linkages ----
# Note: this overwrites hc.complete / hc.average / hc.single.
hc.complete = hclust(dist(d[, 1:2]), method = "complete")
hc.average = hclust(dist(d[, 1:2]), method = "average")
hc.single = hclust(dist(d[, 1:2]), method = "single")
plot(hc.complete, main = "complete", cex = .9)
plot(hc.average, main = "average", cex = .9)
plot(hc.single, main = "single", cex = .9)
hc.c2 = cutree(hc.complete, 2)
table(hc.c2, d$class)

# ----
# NCI60 data from the ISLR package ----
library(ISLR)
nci.labs = NCI60$labs
nci.data = NCI60$data

str(nci.labs)
# Pasted output:
#  chr [1:64] "CNS" "CNS" "CNS" "RENAL" "BREAST" "CNS" "CNS" "BREAST" "NSCLC" ...
str(nci.data)
# Pasted output:
#  num [1:64, 1:6830] 0.3 0.68 0.94 0.28 0.485 ...
#  - attr(*, "dimnames")=List of 2
#   ..$ : chr [1:64] "V1" "V2" "V3" "V4" ...
#   ..$ : chr [1:6830] "1" "2" "3" "4" ...
summary(nci.data[, 1:16])
# Note: the gene expression data have been somehow standardized to median=0,
# but with very different sd.

# PCA on standardised columns. `scale.` is spelled out in full (the original
# `scale=T` relied on partial argument matching and on the reassignable `T`).
pr.out = prcomp(nci.data, scale. = TRUE)

# Map each element of `vec` to a rainbow colour, one colour per distinct
# value; colours are assigned in the order of the factor levels of `vec`.
Cols = function(vec) {
  cols = rainbow(length(unique(vec)))
  cols[as.numeric(as.factor(vec))]
}
# Console transcript:
# > Cols(c(1,3,5))
# [1] "#FF0000FF" "#00FF00FF" "#0000FFFF"

# Score plots coloured by label: PC1 vs PC2, then PC1 vs PC3.
plot(pr.out$x[, 1:2], col = Cols(nci.labs), pch = 19)
plot(pr.out$x[, c(1, 3)], col = Cols(nci.labs), pch = 19)

# Variance per component and cumulative variance.
summary(pr.out)
plot(pr.out)
pr.out$sdev^2
plot(cumsum(pr.out$sdev^2))
plot(cumsum(pr.out$sdev^2), ylim = c(0, 7000))

# ---
# Hierarchical clustering: all standardised columns vs first 10 PC scores ----
sd.data = scale(nci.data)
data.dist = dist(sd.data)
plot(hclust(data.dist), labels = nci.labs, main = "Complete")
data5 = cutree(hclust(data.dist), 5)
table(data5, nci.labs)
data8 = cutree(hclust(data.dist), 8)

pca.dist = dist(pr.out$x[, 1:10])
plot(hclust(pca.dist), labels = nci.labs, main = "Complete")
pca8 = cutree(hclust(pca.dist), 8)
table(pca8, nci.labs)
table(pca8, data8)

# 8-means on the same 10 PC scores, compared with the 8-cluster tree cut.
km.out = kmeans(pr.out$x[, 1:10], 8, nstart = 20)
table(pca8, km.out$cluster)

# Other linkages on the full standardised data.
plot(hclust(data.dist, method = "average"), labels = nci.labs, main = "Average")
plot(hclust(data.dist, method = "single"), labels = nci.labs, main = "Single")
# Cut the current average-linkage tree into 5 groups.
# NOTE(review): this repeats an assignment made earlier in the file and the
# result is not used below -- possibly a stray paste; confirm it is needed.
hc.a5 = cutree(hc.average, k = 5)

# Cluster the NCI60 distance matrix with hclust's default linkage, cut the
# tree into 4 groups, and cross-tabulate the groups against the labels.
hc.out = hclust(data.dist)
hc.clusters = cutree(hc.out, k = 4)
table(hc.clusters, nci.labs)