D=read.csv("Police_Killings_PAR_repB.csv")
str(D)
'data.frame':	1952 obs. of  13 variables:
 $ bpop     : num  8 8 0 2 0 1 1 30 10 46 ...
 $ hpop     : num  11 8 0 4 1 5 30 5 0 1 ...
 $ vgen     : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
 $ vage     : int  33 19 75 33 49 52 31 21 29 53 ...
 $ vrace1   : Factor w/ 4 levels "Asian","Black",..: 3 NA 4 NA 4 4 4 2 4 4 ...
 $ orace1   : Factor w/ 2 levels "Non-White","White": NA NA NA NA NA NA NA 2 NA NA ...
 $ warrant  : int  1 1 1 0 0 1 1 1 0 1 ...
 $ intcause1: int  0 0 1 0 0 0 0 1 0 0 ...
 $ gunnogun : int  1 1 1 1 0 0 NA 0 1 1 ...
 $ incQ5    : int  3 3 2 4 3 4 2 2 2 2 ...
 $ pop5cat  : int  4 1 NA 1 1 1 1 1 1 1 ...
 $ crime2   : num  0.01071 0.0048 NA 0.00411 0.00638 ...
 $ anyweapon: Factor w/ 3 levels "Gun","Other Weapon",..: 1 1 1 1 2 2 NA 2 1 1 ...

 > table(is.na(D$pop5cat),is.na(D$crime2))
       
        FALSE TRUE
  FALSE  1832    3
  TRUE      2  115
  
> table(is.na(D$bpop),is.na(D$incQ5))
       
        FALSE TRUE
  FALSE  1863    0
  TRUE      3   86
  
> table(is.na(D$bpop),is.na(D$hpop))
       
        FALSE TRUE
  FALSE  1862    1
  TRUE      1   88

  > table(is.na(D$orace1),D$vrace1,useNA="ifany")
       
        Asian Black Latino White <NA>
  FALSE    14   178     86   333   14
  TRUE     17   350    276   626   58

  > chisq.test(.Last.value)

	Pearson's Chi-squared test

data:  .Last.value
X-squared = 22.961, df = 4, p-value = 0.0001289 ###

> table(is.na(D$orace1),D$vrace1,useNA="ifany")[,c(2:4)]
       
        Black Latino White
  FALSE   178     86   333
  TRUE    350    276   626
> chisq.test(.Last.value)

	Pearson's Chi-squared test

data:  .Last.value
X-squared = 15.143, df = 2, p-value = 0.000515 ###

t.test(D$incQ5[D$vrace1=="Latino" & is.na(D$orace1)],D$incQ5[D$vrace1=="Latino" & ! is.na(D$orace1)])

	Welch Two Sample t-test

data:  D$incQ5[D$vrace1 == "Latino" & is.na(D$orace1)] and D$incQ5[D$vrace1 == "Latino" & !is.na(D$orace1)]
t = 1.9255, df = 141.94, p-value = 0.05616  ####
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.004582121  0.348585768
sample estimates:
mean of x mean of y 
 2.713178  2.541176 
Q: if a Latino is shot and the officer's race is not reported,
is the location likely to be in a higher income neighborhood than similar deaths where the
officer's race is reported? YES

library(dplyr)

D1=D[!is.na(D$orace1),]
D1$orace1= (D1$orace1=="White")
D1b=D1[D1$vrace1 %in% c("White","Black") ,]
D1b$vrace1= (D1b$vrace1=="Black")
D1h=D1[D1$vrace1 %in% c("White","Latino") ,]
D1h$vrace1= (D1h$vrace1=="Latino")
colnames(D1b)[c(1,4,5,6,10,11,12)]
[1] "bpop"    "vage"    "vrace1"  "orace1"  "incQ5"   "pop5cat" "crime2" 
D1b=D1b[,c(1,4,5,6,10,11,12)]
D1h=D1h[,c(2,4,5,6,10,11,12)]
colnames(D1h)
[1] "hpop"    "vage"    "vrace1"  "orace1"  "incQ5"   "pop5cat" "crime2" 


> table(D1b$orace1,D1b$vrace1)
       
        FALSE TRUE
  FALSE    29   33
  TRUE    304  145
 (Q: report the number of black victims that were killed by black officers. %33
 
 
d=read.csv("pnas.1903856116.sd01B.csv")
sum(d$race=="black")/sum(d$race=="white")/(mean(d$blackPop/d$whitePop))
sum(d$race=="black")/sum(d$race=="white")/(mean(d$blackHom/d$whiteHom))
Q: report the first ratio result. %1.77

hist(d$blackHom/d$whiteHom)
hist(log(d$blackHom/d$whiteHom))

sum(d$race=="black")/sum(d$race=="white")/(median(d$blackHom/d$whiteHom, na.rm=T))

Q: report this ratio. %0.6337


> t.test(d$gini[d$race=="black"],d$gini[d$race=="white"])
t = 8.9797, df = 408.14, p-value < 2.2e-16
mean of x mean of y 
0.4762290 0.4528238 

> t.test(d$income[d$race=="black"],d$income[d$race=="white"])
t = 2.6081, df = 459.35, p-value = 0.009402
mean of x mean of y 
 54374.06  51804.35 SURPRISE!
 
 > t.test(d$popSize[d$race=="black"],d$popSize[d$race=="white"])
t = 4.8042, df = 360.61, p-value = 2.283e-06
mean of x mean of y 
1542286.2  798579.7 

> t.test(d$blackPop[d$race=="black"],d$blackPop[d$race=="white"])
t = 12.234, df = 348.25, p-value < 2.2e-16
mean of x mean of y 
  21.7698    8.9998 

mb =  glm(vrace1~.,family=binomial(link='logit'),data=D1b)
summary(mb)
Call:
glm(formula = vrace1 ~ ., family = binomial(link = "logit"), 
    data = D1b)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8385  -0.7291  -0.4956   0.6876   2.3430  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.269342   0.706470  -0.381 0.703017    
bpop         0.055248   0.007676   7.197 6.14e-13 ***
vage        -0.031575   0.009048  -3.490 0.000483 ***
orace1TRUE  -0.311455   0.337155  -0.924 0.355604    
incQ5       -0.201364   0.172367  -1.168 0.242713    
pop5cat      0.234422   0.076579   3.061 0.002205 ** 
crime2       8.644586  11.925747   0.725 0.468533    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 622.53  on 476  degrees of freedom
Residual deviance: 456.11  on 470  degrees of freedom
  (34 observations deleted due to missingness)
AIC: 470.11

Qs: Which variables are significant? bpop, vage, pop5cat
If the officer is white ({\tt orace1TRUE}) is the victim
more or less likely to be black? LESS
If the victim is young, is the victim more or less likely to be black? MORE

> out=predict(mb, type="response")
> table(out>.5,D1b$vrace1[complete.cases(D1b)])
       
        FALSE TRUE
  FALSE   286   77
  TRUE     20   94
> (286+94)/(286+94+77+20)
[1] 0.7966457

EXTRA
library(boot)

D1b2=D1b[complete.cases(D1b),]
mb2 =  glm(vrace1~.,family=binomial(link='logit'),data=D1b2)
mycost <- function(r, pi){
    c1 = (r==1)&(pi<0.5) #logical vector - true if actual 1 but predict 0
    c0 = (r==0)&(pi>=0.5) #logical vector - true if actual 0 but predict 1
    return(mean(c1+c0))
  }
#counts the average number of misses 
cv.glm(D1b2,mb2,cost=mycost,K=5)

$delta
[1] 0.2096436 0.2092173
#21% miss rate matches predictions

bootit=function(d.f,index){return(coef(glm(vrace1~.,family=binomial,data=d.f,subset=index))[4])}
boot(D1b,bootit,1000)
Call:
boot(data = D1b, statistic = bootit, R = 1000)


Bootstrap Statistics :
      original       bias    std. error
t1* -0.3114552 -0.009965751   0.3641654

matches glm error

END EXTRA

mh =  glm(vrace1~.,family=binomial(link='logit'),data=D1h)
summary(mh)
glm(formula = vrace1 ~ ., family = binomial(link = "logit"), 
    data = D1h)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2527  -0.4717  -0.3320  -0.2049   2.6546  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.327939   0.971722  -1.367   0.1718    
hpop         0.070317   0.008865   7.932 2.15e-15 ***
vage        -0.032226   0.013561  -2.376   0.0175 *  
orace1TRUE  -0.514286   0.423335  -1.215   0.2244    
incQ5       -0.059240   0.239768  -0.247   0.8049    
pop5cat     -0.062745   0.117941  -0.532   0.5947    
crime2       9.545717  15.753012   0.606   0.5445    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 397.09  on 386  degrees of freedom
Residual deviance: 246.66  on 380  degrees of freedom
  (32 observations deleted due to missingness)
AIC: 260.66

Number of Fisher Scoring iterations: 5

out=predict(mh, type="response")
table(out>.5,D1h$vrace1[complete.cases(D1h)])
       FALSE TRUE
  FALSE   296   37
  TRUE     10   44
> (296+44)/(296+44+10+37)
[1] 0.878553

always white rate:
> (296+10)/(296+44+10+37)
[1] 0.7906977