D=read.csv("Police_Killings_PAR_repB.csv") str(D) 'data.frame': 1952 obs. of 13 variables: $ bpop : num 8 8 0 2 0 1 1 30 10 46 ... $ hpop : num 11 8 0 4 1 5 30 5 0 1 ... $ vgen : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ... $ vage : int 33 19 75 33 49 52 31 21 29 53 ... $ vrace1 : Factor w/ 4 levels "Asian","Black",..: 3 NA 4 NA 4 4 4 2 4 4 ... $ orace1 : Factor w/ 2 levels "Non-White","White": NA NA NA NA NA NA NA 2 NA NA ... $ warrant : int 1 1 1 0 0 1 1 1 0 1 ... $ intcause1: int 0 0 1 0 0 0 0 1 0 0 ... $ gunnogun : int 1 1 1 1 0 0 NA 0 1 1 ... $ incQ5 : int 3 3 2 4 3 4 2 2 2 2 ... $ pop5cat : int 4 1 NA 1 1 1 1 1 1 1 ... $ crime2 : num 0.01071 0.0048 NA 0.00411 0.00638 ... $ anyweapon: Factor w/ 3 levels "Gun","Other Weapon",..: 1 1 1 1 2 2 NA 2 1 1 ... > table(is.na(D$pop5cat),is.na(D$crime2)) FALSE TRUE FALSE 1832 3 TRUE 2 115 > table(is.na(D$bpop),is.na(D$incQ5)) FALSE TRUE FALSE 1863 0 TRUE 3 86 > table(is.na(D$bpop),is.na(D$hpop)) FALSE TRUE FALSE 1862 1 TRUE 1 88 > table(is.na(D$orace1),D$vrace1,useNA="ifany") Asian Black Latino White FALSE 14 178 86 333 14 TRUE 17 350 276 626 58 > chisq.test(.Last.value) Pearson's Chi-squared test data: .Last.value X-squared = 22.961, df = 4, p-value = 0.0001289 ### > table(is.na(D$orace1),D$vrace1,useNA="ifany")[,c(2:4)] Black Latino White FALSE 178 86 333 TRUE 350 276 626 > chisq.test(.Last.value) Pearson's Chi-squared test data: .Last.value X-squared = 15.143, df = 2, p-value = 0.000515 ### t.test(D$incQ5[D$vrace1=="Latino" & is.na(D$orace1)],D$incQ5[D$vrace1=="Latino" & ! is.na(D$orace1)]) Welch Two Sample t-test data: D$incQ5[D$vrace1 == "Latino" & is.na(D$orace1)] and D$incQ5[D$vrace1 == "Latino" & !is.na(D$orace1)] t = 1.9255, df = 141.94, p-value = 0.05616 #### alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.004582121 0.348585768 sample estimates: mean of x mean of y 2.713178 2.541176 Q: if a Latino is shot and the officer's race is not reported, is the location likely to be in a higher income neighborhood than similar deaths where the officer's race is reported? YES library(dplyr) D1=D[!is.na(D$orace1),] D1$orace1= (D1$orace1=="White") D1b=D1[D1$vrace1 %in% c("White","Black") ,] D1b$vrace1= (D1b$vrace1=="Black") D1h=D1[D1$vrace1 %in% c("White","Latino") ,] D1h$vrace1= (D1h$vrace1=="Latino") colnames(D1b)[c(1,4,5,6,10,11,12)] [1] "bpop" "vage" "vrace1" "orace1" "incQ5" "pop5cat" "crime2" D1b=D1b[,c(1,4,5,6,10,11,12)] D1h=D1h[,c(2,4,5,6,10,11,12)] colnames(D1h) [1] "hpop" "vage" "vrace1" "orace1" "incQ5" "pop5cat" "crime2" > table(D1b$orace1,D1b$vrace1) FALSE TRUE FALSE 29 33 TRUE 304 145 (Q: report the number of black victims that were killed by black officers. %33 d=read.csv("pnas.1903856116.sd01B.csv") sum(d$race=="black")/sum(d$race=="white")/(mean(d$blackPop/d$whitePop)) sum(d$race=="black")/sum(d$race=="white")/(mean(d$blackHom/d$whiteHom)) Q: report the first ratio result. %1.77 hist(d$blackHom/d$whiteHom) hist(log(d$blackHom/d$whiteHom)) sum(d$race=="black")/sum(d$race=="white")/(median(d$blackHom/d$whiteHom, na.rm=T)) Q: report this ratio. %0.6337 > t.test(d$gini[d$race=="black"],d$gini[d$race=="white"]) t = 8.9797, df = 408.14, p-value < 2.2e-16 mean of x mean of y 0.4762290 0.4528238 > t.test(d$income[d$race=="black"],d$income[d$race=="white"]) t = 2.6081, df = 459.35, p-value = 0.009402 mean of x mean of y 54374.06 51804.35 SURPRISE! > t.test(d$popSize[d$race=="black"],d$popSize[d$race=="white"]) t = 4.8042, df = 360.61, p-value = 2.283e-06 mean of x mean of y 1542286.2 798579.7 > t.test(d$blackPop[d$race=="black"],d$blackPop[d$race=="white"]) t = 12.234, df = 348.25, p-value < 2.2e-16 mean of x mean of y 21.7698 8.9998 mb = glm(vrace1~.,family=binomial(link='logit'),data=D1b) summary(mb) Call: glm(formula = vrace1 ~ ., family = binomial(link = "logit"), data = D1b) Deviance Residuals: Min 1Q Median 3Q Max -2.8385 -0.7291 -0.4956 0.6876 2.3430 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.269342 0.706470 -0.381 0.703017 bpop 0.055248 0.007676 7.197 6.14e-13 *** vage -0.031575 0.009048 -3.490 0.000483 *** orace1TRUE -0.311455 0.337155 -0.924 0.355604 incQ5 -0.201364 0.172367 -1.168 0.242713 pop5cat 0.234422 0.076579 3.061 0.002205 ** crime2 8.644586 11.925747 0.725 0.468533 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 622.53 on 476 degrees of freedom Residual deviance: 456.11 on 470 degrees of freedom (34 observations deleted due to missingness) AIC: 470.11 Qs: Which variables are significant? bpop, vage, pop5cat If the officer is white ({\tt orace1TRUE}) is the victim more or less likely to be black? LESS If the victim is young, is the victim more or less likely to be black? MORE > out=predict(mb, type="response") > table(out>.5,D1b$vrace1[complete.cases(D1b)]) FALSE TRUE FALSE 286 77 TRUE 20 94 > (286+94)/(286+94+77+20) [1] 0.7966457 EXTRA library(boot) D1b2=D1b[complete.cases(D1b),] mb2 = glm(vrace1~.,family=binomial(link='logit'),data=D1b2) mycost <- function(r, pi){ c1 = (r==1)&(pi<0.5) #logical vector - true if actual 1 but predict 0 c0 = (r==0)&(pi>=0.5) #logical vector - true if actual 0 but predict 1 return(mean(c1+c0)) } #counts the average number of misses cv.glm(D1b2,mb2,cost=mycost,K=5) $delta [1] 0.2096436 0.2092173 #21% miss rate matches predictions bootit=function(d.f,index){return(coef(glm(vrace1~.,family=binomial,data=d.f,subset=index))[4])} boot(D1b,bootit,1000) Call: boot(data = D1b, statistic = bootit, R = 1000) Bootstrap Statistics : original bias std. error t1* -0.3114552 -0.009965751 0.3641654 matches glm error END EXTRA mh = glm(vrace1~.,family=binomial(link='logit'),data=D1h) summary(mh) glm(formula = vrace1 ~ ., family = binomial(link = "logit"), data = D1h) Deviance Residuals: Min 1Q Median 3Q Max -2.2527 -0.4717 -0.3320 -0.2049 2.6546 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.327939 0.971722 -1.367 0.1718 hpop 0.070317 0.008865 7.932 2.15e-15 *** vage -0.032226 0.013561 -2.376 0.0175 * orace1TRUE -0.514286 0.423335 -1.215 0.2244 incQ5 -0.059240 0.239768 -0.247 0.8049 pop5cat -0.062745 0.117941 -0.532 0.5947 crime2 9.545717 15.753012 0.606 0.5445 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 397.09 on 386 degrees of freedom Residual deviance: 246.66 on 380 degrees of freedom (32 observations deleted due to missingness) AIC: 260.66 Number of Fisher Scoring iterations: 5 out=predict(mh, type="response") table(out>.5,D1h$vrace1[complete.cases(D1h)]) FALSE TRUE FALSE 296 37 TRUE 10 44 > (296+44)/(296+44+10+37) [1] 0.878553 always white rate: > (296+10)/(296+44+10+37) [1] 0.7906977