> df=read.csv("titanic.csv")
> str(df)
'data.frame': 887 obs. of 6 variables:
 $ Survived: logi FALSE TRUE TRUE TRUE FALSE FALSE ...
 $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
 $ Name : Factor w/ 887 levels "Capt. Edward Gifford Crosby",..: 602 823 172 814 733 464 700 33 842 839 ...
 $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age : num 22 38 26 35 35 27 54 2 27 14 ...
 $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
mean(df$Survived[df$Sex=="female"])
[1] 0.7420382
> (91+70+72)/(91+70+72+3+6+72)
[1] 0.7420382
mean(df$Survived[df$Sex=="female" & df$Pclass==1])
[1] 0.9680851
> (91)/(91+3)
[1] 0.9680851
mean(df$Survived[df$Sex=="female" & df$Pclass==2])
[1] 0.9210526
> 70/(70+6)
[1] 0.9210526
mean(df$Survived[df$Sex=="female" & df$Pclass==3])
[1] 0.5
> 72/(72+72)
[1] 0.5
> mean(df$Survived[df$Sex=="male"])
[1] 0.1902269
> mean(df$Survived[df$Sex=="male" & df$Pclass==1])
[1] 0.3688525
> mean(df$Survived[df$Sex=="male" & df$Pclass==2])
[1] 0.1574074
> mean(df$Survived[df$Sex=="male" & df$Pclass==3])
[1] 0.1370262
> (45+17+47)/(45+17+47+77+91+296)
[1] 0.1902269
> (45)/(45+77)
[1] 0.3688525
> (17)/(17+91)
[1] 0.1574074
> (47)/(47+296)
[1] 0.1370262
(91+70+72)/(91+70+72+3+6+72)=
91/(91+3)*(91+3)/(91+70+72+3+6+72) +
70/(70+6)*(70+6)/(91+70+72+3+6+72) +
72/(72+72)*(72+72)/(91+70+72+3+6+72)
(91+45)/(91+70+72+45+17+47) * (91+70+72+45+17+47)/887
(91+45)/(94+122) * (94+122)/887
(91+70+72)/887 != 
342/887 * 314/887
note: fout/mout are not defined in this log; presumably the Survived x Pclass tables for females/males (the expected counts below match the male totals)
> a=fisher.test(fout)
> a
Fisher's Exact Test for Count Data
data: fout
p-value < 2.2e-16
alternative hypothesis: two.sided
> b=chisq.test(mout)
> b
Pearson's Chi-squared test
data: mout
X-squared = 32.328, df = 2, p-value = 9.552e-08
> b$expected
        Pclass
Survived        1       2         3
   FALSE 98.79232 87.4555 277.75218
   TRUE  23.20768 20.5445  65.24782
main difference is in 1st class survival: fewer survivors were expected (23.2) than observed (45)
young females & 24-32 males more common
> pM=hist(df$Age[df$Sex=="male"],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
> pF=hist(df$Age[df$Sex=="female"],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
1st class: 8-24 females, and generally older men likely
> pM=hist(df$Age[df$Sex=="male" & df$Pclass==1],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
> pF=hist(df$Age[df$Sex=="female" & df$Pclass==1],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
1st vs 3rd class men: young 3rd class & older 1st class men likely
> pM=hist(df$Age[df$Sex=="male" & df$Pclass==1],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
> pF=hist(df$Age[df$Sex=="male" & df$Pclass==3],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
survive/dead 0-8 survived; 18-24 dead
> pM=hist(df$Age[df$Survived],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
> pF=hist(df$Age[! 
df$Survived],xlim=c(0,80),breaks=seq(0,80,8),freq=F)
count men survived/total: 0-8 best bet to survive
> pM=hist(df$Age[df$Sex=="male"],xlim=c(0,80),breaks=seq(0,80,8))
> pM=hist(df$Age[df$Sex=="male" & df$Survived],xlim=c(0,80),breaks=seq(0,80,8))
count men survive/total 1st class very young 100%, middle age ~50%
> pF=hist(df$Age[df$Sex=="male" & df$Pclass==1],xlim=c(0,80),breaks=seq(0,80,8))
> pM=hist(df$Age[df$Sex=="male" & df$Survived & df$Pclass==1],xlim=c(0,80),breaks=seq(0,80,8))
count men survive/total 2nd class very young survived
> pF=hist(df$Age[df$Sex=="male" & df$Pclass==2],xlim=c(0,80),breaks=seq(0,80,8))
> pM=hist(df$Age[df$Sex=="male" & df$Survived & df$Pclass==2],xlim=c(0,80),breaks=seq(0,80,8))
count men survive/total 3rd class not good for any, but young better than old
> pF=hist(df$Age[df$Sex=="male" & df$Pclass==3],xlim=c(0,80),breaks=seq(0,80,8))
> pM=hist(df$Age[df$Sex=="male" & df$Survived & df$Pclass==3],xlim=c(0,80),breaks=seq(0,80,8))
count female survived pretty good for all age groups, old was best
> pF=hist(df$Age[df$Sex=="female"],xlim=c(0,80),breaks=seq(0,80,8))
> pM=hist(df$Age[df$Sex=="female" & df$Survived ],xlim=c(0,80),breaks=seq(0,80,8))
t.test(df$Age[df$Sex=="female" & df$Pclass==1],df$Age[df$Sex=="male" & df$Pclass==1])
t = -3.3197, df = 206.89, p-value = 0.001065
1st class mean age: females 35 vs males 41
> t.test(df$Age[df$Sex=="male" & df$Pclass==3],df$Age[df$Sex=="male" & df$Pclass==1])
t = -10.402, df = 183.16, p-value < 2.2e-16
3rd: 26.5 vs 1st 41.5
> df2=df[df$Fare>0,]
> boxplot(df2$Fare ~ df2$Pclass,log="Y")
Error in plot.window(xlim = xlim, ylim = ylim, log = log, yaxs = pars$yaxs) :
  invalid "log=Y" specification
> boxplot(df2$Fare ~ df2$Pclass,log="y")
> which.min(df2$Fare[df2$Pclass==1])
[1] 208
> (df2[df2$Pclass==1,])[208,] ...........note: ()[208,]
    Survived Pclass                    Name  Sex Age Fare
869    FALSE      1 Mr. Frans Olof Carlsson male  33    5