d=read.csv("WHO19.csv") the file WHO19.csv contains 'data.frame': 193 obs. of 77 variables: > sum(complete.cases(d)) [1] 4 > names(d)[is.na(d[d$Country=="United States of America",])] [1] "literacy." [2] "Aid" [3] "External_debt_per_GNI" [4] "Literacy_rate_adult_female" [5] "Literacy_rate_adult_male" [6] "Literacy_rate_adult_total" [7] "Primary_completion_rate_total" [8] "Ratio_of_young_literate_females_to_males" I'm going to remove the cases with lots of NA > is=function(i){return(sum(is.na(d[,i])))} > sapply(1:77,is) [1] 0 0 0 16 62 15 14 14 0 0 0 14 0 1 12 85 13 0 0 [20] 0 0 0 0 2 2 58 0 2 2 0 0 13 15 64 19 14 20 36 [39] 111 29 48 15 14 34 40 64 49 64 16 13 17 30 11 64 28 14 33 [58] 62 62 61 24 24 28 38 47 32 14 28 26 28 17 66 13 15 21 13 [77] 13 names(d)[sapply(1:77,is)<20] d2=d[,sapply(1:77,is)<20] out1=lm(mortalityFemale~.,data=d2[3:45]) out2=lm(mortalityFemale~fertility.+Population+Births_Skilled.+physicians+mortalityMale+cardiovascular+lifeFemale+lifeMale+Adolescent_fertility_rate ,data=d2[3:45]) out3=lm(mortalityFemale~fertility.+Births_Skilled.+physicians+mortalityMale+cardiovascular+lifeFemale+lifeMale+Adolescent_fertility_rate ,data=d2[3:45]) out4=lm(mortalityFemale~fertility.+Births_Skilled.+mortalityMale+cardiovascular+lifeFemale+lifeMale+cardiovascular*lifeFemale ,data=d2) Call: lm(formula = mortalityFemale ~ fertility. + Births_Skilled. + mortalityMale + cardiovascular + lifeFemale + lifeMale + cardiovascular * lifeFemale, data = d2) Residuals: Min 1Q Median 3Q Max -243.75 -16.15 0.59 18.21 106.84 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 228.159648 85.494963 2.669 0.0084 ** fertility. 0.160824 0.082849 1.941 0.0540 . Births_Skilled. 0.343812 0.171253 2.008 0.0464 * mortalityMale 0.640124 0.047602 13.447 < 2e-16 *** cardiovascular 0.101558 0.145050 0.700 0.4849 lifeFemale -11.724669 1.485791 -7.891 4.56e-13 *** lifeMale 8.706121 1.713046 5.082 1.04e-06 *** cardiovascular:lifeFemale -0.003842 0.002248 -1.709 0.0894 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 33.33 on 159 degrees of freedom (26 observations deleted due to missingness) Multiple R-squared: 0.9482, Adjusted R-squared: 0.9459 F-statistic: 415.8 on 7 and 159 DF, p-value: < 2.2e-16 reduce mortalityFemale: decrease:fertility.,Births_Skilled.,mortalityMale,cardiovascular,lifeMale, increase:lifeFemale, cardiovascular:lifeFemale explain some variable in terms of linear regression of some other variables. your model must contain at least three significant variables (not counting Intercept) your model should conatin some interaction term, or show evidence that you tried hard to find such a term and failed lmA=lm(CO2_emissions~Broadband_per+Pump_price_for_gasoline*Electric_power_consumption+Urban_pop.,data=d) --- wine: pick out 4-5 variables which show large variations between classes. 
---
wine: pick out 4-5 variables which show large variations between classes. You can do this just by looking at boxplots (report what you did; no need for lots of boxplot hardcopies), but it is quite easy to automate, as was done with t.test in tree wilt: aov is exactly designed to determine whether some variable deviates significantly between classes (much as t.test does for 2 classes). A large F statistic suggests big differences, and it can be retrieved with
summary(aov(d[,14]~d$class))[[1]][1,4]
where in this case 14 refers to the 14th column: Proline. Once you've reduced the number of variables, make a reduced data.frame of just those variables and do KNN & LDA analysis.

library(class)
d=read.csv("wine.csv")
d$class=as.factor(d$class)
nova=function(i){return(summary(aov(d[,i]~d$class))[[1]][1,4])}
q=sapply(2:14,nova)
names(q)=names(d)[2:14]
> sort(q,decreasing=T)
   Flavanoids       Proline   OD280.OD315       Alcohol         Color
    233.92587     207.92037     189.97232     135.07762     120.66402
          Hue       Phenols    Malic.acid    Alcalinity Proanthocyanins
    101.31680      93.73301      36.94342      35.77164      30.27138
 Nonflavanoid           Ash     Magnesium
     27.57542      13.31290      12.42958

d2=d[,names(sort(q,decreasing=T))[1:5]]
d2$class=d$class
> str(d2)
'data.frame': 178 obs. of 6 variables:
 $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
 $ Proline    : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
 $ OD280.OD315: num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
 $ Alcohol    : num 14.2 13.2 13.2 14.4 13.2 ...
 $ Color      : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
 $ class      : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...

d2s=as.data.frame(scale(d2[,1:5]))
d2s$class=d2$class

KNN on the unscaled data:
knn.pred=knn(d2[,1:5],d2[,1:5],d2$class,k=5)
table(knn.pred,d2$class)
knn.pred  1  2  3
       1 53  4  2
       2  2 59 15
       3  4  8 31
knn.pred=knn(d2[,1:5],d2[,1:5],d2$class,k=7)
table(knn.pred,d2$class)
knn.pred  1  2  3
       1 54  4  2
       2  1 58 18
       3  4  9 28

KNN on the scaled data:
knn.pred=knn(d2s[,1:5],d2s[,1:5],d2$class,k=5)
table(knn.pred,d2$class)
knn.pred  1  2  3
       1 59  3  0
       2  0 67  0
       3  0  1 48
knn.pred=knn(d2s[,1:5],d2s[,1:5],d2$class,k=7)
table(knn.pred,d2$class)
knn.pred  1  2  3
       1 59  2  0
       2  0 68  0
       3  0  1 48
BEST

library(MASS)
lda1=lda(class~.,data=d2)
> lda1
Call:
lda(class ~ ., data = d2)
Prior probabilities of groups:
        1         2         3
0.3314607 0.3988764 0.2696629
Group means:
  Flavanoids   Proline OD280.OD315  Alcohol    Color
1  2.9823729 1115.7119    3.157797 13.74475 5.528305
2  2.0808451  519.5070    2.785352 12.27873 3.086620
3  0.7814583  629.8958    1.683542 13.15375 7.396250
Coefficients of linear discriminants:
                     LD1          LD2
Flavanoids  -1.297131662 -0.615496180
Proline     -0.003369503  0.002611238
OD280.OD315 -0.866242302 -0.009041679
Alcohol     -0.484013672  0.982347076
Color        0.401350482  0.332397481
Proportion of trace:
   LD1    LD2
0.7235 0.2765

lda.pred=predict(lda1,d2)
table(lda.pred$class,d2$class)
     1  2  3
  1 56  1  0
  2  3 69  0
  3  0  1 48

lda2=lda(class~.,data=d2s)
Call:
lda(class ~ ., data = d2s)
Prior probabilities of groups:
        1         2         3
0.3314607 0.3988764 0.2696629
Group means:
   Flavanoids    Proline OD280.OD315    Alcohol      Color
1  0.95419225  1.1711967   0.7691811  0.9166093  0.2028288
2  0.05163434 -0.7220731   0.2446043 -0.8892116 -0.8503999
3 -1.24923710 -0.3715295  -1.3072623  0.1886265  1.0085728
Coefficients of linear discriminants:
                   LD1          LD2
Flavanoids  -1.2956512 -0.614793705
Proline     -1.0610815  0.822298462
OD280.OD315 -0.6150237 -0.006419505
Alcohol     -0.3929351  0.797495426
Color        0.9304452  0.770592383
Proportion of trace:
   LD1    LD2
0.7235 0.2765

lda.pred=predict(lda2,d2s)
table(lda.pred$class,d2$class)
     1  2  3
  1 56  1  0
  2  3 69  0
  3  0  1 48

Scaled and unscaled LDA give exactly the same predictions; scaled KNN does slightly better than LDA.
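Note that the KNN tables above predict the training set itself, which flatters the error rate. A leave-one-out check keeps the comparison with LDA honest; a quick sketch (my addition) using knn.cv from the same class package, assuming d2s as built above:

library(class)
# Leave-one-out cross-validation: each row is classified by its
# nearest neighbours among the remaining 177 rows.
cv.pred=knn.cv(d2s[,1:5],d2s$class,k=7)
table(cv.pred,d2s$class)
mean(cv.pred==d2s$class)  # LOO accuracy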
---
In the stocks directory find files with names like X2000.csv, where X is a stock ticker. Read in the file and look at the columns:
> str(d)
'data.frame': 4821 obs. of 14 variables:
 $ Date     : Factor w/ 4821 levels "2000-01-03","2000-01-04",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Close    : num 47.2 45.3 46.6 50.4 51.4 ...
 $ Volume   : int 2173400 2713800 3699400 5975800 4101200 3863800 2357600 2868400 2244400 2541800 ...
 $ Today    : num -0.0371 -0.0414 0.0282 0.0744 0.0195 ...
 $ Lag1     : num 0.0153 -0.0371 -0.0414 0.0282 0.0744 ...
 $ Lag2     : num -0.0117 0.0153 -0.0371 -0.0414 0.0282 ...
 $ AVolume  : num 2195387 2207000 2263013 2377687 2418267 ...
 $ TodayV   : num 0.53 0.199 0.266 0.381 -0.457 ...
 $ Lag1V    : num -0.671 0.53 0.199 0.266 0.381 ...
 $ Lag2V    : num -0.138 -0.671 0.53 0.199 0.266 ...
 $ AClose   : num 47.7 47.5 47.4 47.5 47.6 ...
 $ Lag1BullC: num 1.081 -0.528 -2.209 -0.801 2.881 ...
 $ Lag1BullV: num -1192340 -21987 506800 1436387 3598113 ...
 $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 1 2 1 1 ...

Aim: predict the Direction the stock will move during the day based on the variables that start with Lag (these variables deal with price/volume changes on the previous 2 days, which could be known before the start of the trading day; the ones with Bull in the name are slightly more complex versions).

Use logistic regression to build a model for Direction based on the variables that start with Lag. Your final model should include as many *significant* (here defined as p<.1) factors as possible.

predict()ing with your model should produce probabilities for Up; let's call that out. If you always bought when Up was predicted and sold when Down was predicted, your run of percentage gains/losses would be
  d$Today*(out>.5)-d$Today*(out<.5)
Explain exactly how this works, including the structure/meaning of each of the four terms (an annotated sketch follows the list below). So your average gain would be
  mean(d$Today*(out>.5)-d$Today*(out<.5))
The regression is likely to result in a positive average gain, but given all the fluctuations, is that gain really significantly greater than zero in terms of the standard deviation of the mean? Provide the output of a one-sample t-test to resolve this issue.

Significant Lag variables found per file:
  BBY2000.csv:  Lag2
  CHRW2000.csv: Lag1BullV+Lag1BullC
  ECL2000.csv:  Lag1V, Lag2V
  HRL2000.csv:  Lag1, Lag1BullV
  MMM2000.csv:  Lag1, Lag1V, Lag1BullC
  TGT2000.csv:  Lag1, Lag2
  UNH2000.csv:  Lag1BullC
  USB2000.csv:  Lag1BullC
  XEL2000.csv:  Lag1, Lag2, Lag1V
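Here is how the four pieces of the gain expression fit together (annotation mine; names as above, with out the vector of fitted Up probabilities):

# out>.5            : logical vector, TRUE on days the model predicts Up
#                     (coerced to 1/0 when multiplied)
# out<.5            : logical vector, TRUE on days the model predicts Down
# d$Today*(out>.5)  : keeps Today's return on predicted-Up days (we bought)
# -d$Today*(out<.5) : on predicted-Down days we sold, so we gain when the
#                     price actually fell (Today negative)
gain=d$Today*(out>.5)-d$Today*(out<.5)
mean(gain)    # average daily percentage gain of the strategy
t.test(gain)  # one-sample t-test: is the mean gain significantly nonzero?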
EG MMM:
> d=read.csv("MMM2000.csv")
> gl1=glm(Direction ~ Lag1+Lag1BullC, family=binomial, data=d)
> summary(gl1)
Call:
glm(formula = Direction ~ Lag1 + Lag1BullC, family = binomial, data = d)
Deviance Residuals:
   Min      1Q  Median      3Q     Max
-1.462  -1.201   1.073   1.151   1.360
Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept)  0.076665   0.029081   2.636  0.00838 **
Lag1        -4.256257   2.081467  -2.045  0.04087 *
Lag1BullC   -0.013315   0.007678  -1.734  0.08287 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)

> gl2=glm(Direction ~ Lag1+Lag1V+Lag1BullV+Lag1BullC, family=binomial, data=d)
> summary(gl2)
Call:
glm(formula = Direction ~ Lag1 + Lag1V + Lag1BullV + Lag1BullC, family = binomial, data = d)
Deviance Residuals:
   Min      1Q  Median      3Q     Max
-1.405  -1.205   1.061   1.147   1.404
Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept)  8.840e-02  2.955e-02   2.992  0.00277 **
Lag1        -4.272e+00  2.086e+00  -2.048  0.04058 *
Lag1V        1.719e-01  7.959e-02   2.160  0.03079 *
Lag1BullV   -3.715e-08  2.088e-08  -1.779  0.07520 .
Lag1BullC   -1.600e-02  7.828e-03  -2.044  0.04092 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6677.5 on 4820 degrees of freedom
Residual deviance: 6661.8 on 4816 degrees of freedom
AIC: 6671.8
Number of Fisher Scoring iterations: 3

> out=predict(gl2,type="response")
> str(out>.5)
 Named logi [1:4821] FALSE TRUE TRUE FALSE FALSE FALSE ...
 - attr(*, "names")= chr [1:4821] "1" "2" "3" "4" ...
> str(d$Today)
 num [1:4821] -0.0371 -0.0414 0.0282 0.0744 0.0195 ...
> gain=d$Today*(out>.5)-d$Today*(out<.5)
> mean(gain)
[1] 0.0003031754
> t.test(gain)
One Sample t-test
data: gain
t = 1.4553, df = 4820, p-value = 0.1456
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 -0.0001052322 0.0007115830
sample estimates:
   mean of x
0.0003031754

Only trading when the model is more confident does better:
> gain=d$Today*(out>.53)-d$Today*(out<.48)
> t.test(gain)
One Sample t-test
data: gain
t = 2.6074, df = 4820, p-value = 0.009152
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 9.205411e-05 6.499944e-04
sample estimates:
   mean of x
0.0003710243

d$Date=as.Date(d$Date)
plot(d$Date,cumsum(gain),pch=".")

Now check the 2020 data:
d2=read.csv("new/MMM2020.csv")
d2$Date=as.Date(d2$Date)
gl3=glm(Direction ~ Lag1+Lag1V+Lag1BullV+Lag1BullC, family=binomial, data=d2)
out3=predict(gl3,d2,type="response")
gain=d2$Today*(out3>.53)-d2$Today*(out3<.48)
t.test(gain)
One Sample t-test
data: gain
t = 0.14382, df = 178, p-value = 0.8858
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 -0.001523367 0.001762861
sample estimates:
   mean of x
0.0001197469
plot(d2$Date,cumsum(gain))
The cumulative gain just oscillates, with very small net gain.

---
d=read.csv("XEL2000.csv")
gl2=glm(Direction ~ Lag1+Lag1V+Lag2+Lag2V+Lag1BullV+Lag1BullC, family=binomial, data=d)
Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept)  7.663e-02  3.048e-02   2.514   0.0119 *
Lag1        -4.204e+00  1.888e+00  -2.227   0.0260 *
Lag1V        1.357e-01  6.503e-02   2.087   0.0369 *
Lag2        -4.476e+00  1.881e+00  -2.380   0.0173 *
Lag2V       -5.233e-03  6.085e-02  -0.086   0.9315
Lag1BullV   -2.228e-08  2.814e-08  -0.792   0.4285
Lag1BullC   -1.337e-02  3.149e-02  -0.425   0.6711

Drop the non-significant terms:
gl3=glm(Direction ~ Lag1+Lag1V+Lag2, family=binomial, data=d)
summary(gl3)
Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  0.07423    0.02933   2.531   0.0114 *
Lag1        -4.32564    1.83812  -2.353   0.0186 *
Lag1V        0.11982    0.05781   2.073   0.0382 *
Lag2        -4.55790    1.81852  -2.506   0.0122 *

out3=predict(gl3,d,type="response")
t.test(d$Today*(out3>.5)-d$Today*(out3<.5))
t.test(d$Today*(out3>.52)-d$Today*(out3<.48))
t.test(d$Today*(out3>.5)-d$Today*(out3<.48))
t.test(d$Today*(out3>.51)-d$Today*(out3<.48))
t.test(d$Today*(out3>.51)-d$Today*(out3<.47))
Stop there and settle on thresholds .51/.47:
gain=d$Today*(out3>.51)-d$Today*(out3<.47)
d$Date=as.Date(d$Date)
plot(d$Date,cumsum(gain),pch=".")

dn=read.csv("new/XEL2020.csv")
dn$Date=as.Date(dn$Date)
out4=predict(gl3,dn,type="response")
gain2=dn$Today*(out4>.51)-dn$Today*(out4<.47)
plot(dn$Date,cumsum(gain2))
plot(dn$Date,dn$Close)
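Rather than typing each threshold pair by hand, the scan can be automated; a sketch (my addition, reusing d and out3 from the XEL fit above):

# Scan buy/sell threshold pairs and report the mean gain and its t-test
for(hi in c(.50,.51,.52,.53)){
  for(lo in c(.47,.48,.49,.50)){
    g=d$Today*(out3>hi)-d$Today*(out3<lo)
    tt=t.test(g)
    cat(sprintf("buy>%.2f sell<%.2f  mean=%.6f  t=%.2f  p=%.3f\n",
                hi,lo,mean(g),tt$statistic,tt$p.value))
  }
}

Bear in mind that picking thresholds to maximize the in-sample t statistic is itself a form of overfitting, which is one reason the 2020 files are worth checking.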
---
d=read.csv("BBY2000.csv")
gl2=glm(Direction ~ Lag1+Lag1V+Lag2+Lag2V+Lag1BullV+Lag1BullC, family=binomial, data=d)
Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept)  3.483e-02  2.948e-02   1.181   0.2374
Lag1        -1.156e+00  1.015e+00  -1.139   0.2547
Lag1V        1.361e-02  2.467e-02   0.552   0.5812
Lag2        -2.465e+00  1.021e+00  -2.414   0.0158 *
Lag2V        4.674e-02  5.170e-02   0.904   0.3660
Lag1BullV    1.060e-08  7.179e-09   1.477   0.1398
Lag1BullC    1.310e-02  1.136e-02   1.153   0.2487

Only Lag2 is significant, so reduce to it:
gl3=glm(Direction ~ Lag2, family=binomial, data=d)
Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  0.02995    0.02882   1.039   0.2988
Lag2        -2.23502    0.96507  -2.316   0.0206 *

out3=predict(gl3,d,type="response")
t.test(d$Today*(out3>.5)-d$Today*(out3<.5))
t.test(d$Today*(out3>.52)-d$Today*(out3<.48))
t.test(d$Today*(out3>.5)-d$Today*(out3<.48))
t.test(d$Today*(out3>.51)-d$Today*(out3<.48))
t.test(d$Today*(out3>.51)-d$Today*(out3<.47))
Stop there and settle on thresholds .51/.48:
gain=d$Today*(out3>.51)-d$Today*(out3<.48)
d$Date=as.Date(d$Date)
plot(d$Date,cumsum(gain),pch=".")

dn=read.csv("new/BBY2020.csv")
dn$Date=as.Date(dn$Date)
out4=predict(gl3,dn,type="response")
gain2=dn$Today*(out4>.51)-dn$Today*(out4<.48)
plot(dn$Date,cumsum(gain2))
plot(dn$Date,dn$Close)
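Since the same fit-then-threshold-then-t.test workflow repeats for every ticker, it could be wrapped in one function; a sketch (my addition, assuming the file layout used above):

# Fit a logistic model on a ticker's file, apply buy/sell thresholds to the
# fitted probabilities, and t-test the resulting daily gains.
run.stock=function(file,form,hi=.51,lo=.48){
  d=read.csv(file)
  g=glm(form,family=binomial,data=d)
  out=predict(g,type="response")
  gain=d$Today*(out>hi)-d$Today*(out<lo)
  t.test(gain)
}
run.stock("BBY2000.csv",Direction~Lag2)
run.stock("XEL2000.csv",Direction~Lag1+Lag1V+Lag2,hi=.51,lo=.47)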