R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

The syntax on line 19 then the subsequent on line 56 tells R Markdown to run R code without the asterisks embeds the subsequent code as embedded inside the R Markdown file

library(ggplot2)
library(plyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.3.3
setwd("C:/Users/jlariv/OneDrive/Econ 404/")
oj <- read.csv("oj.csv") 

## create some colors for the brands
levels(oj$brand)
## [1] "dominicks"   "minute.maid" "tropicana"
#brandcol <- c("green","red","gold")
par(mfrow=c(1,2))
ggplot(oj, aes(factor(brand), price)) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(factor(brand), log(price))) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(logmove, price)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(price, logmove)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(feat, brand)) + geom_point(position = "jitter", aes(color = factor(brand)))

#using aggregate from the base package; 5 and 6 are the location in the column of the dataframe
aggregate(oj[, 5:6], list(oj$brand), mean)
##       Group.1      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#using ddplyr from the plyr package
ddply(oj, .(brand), summarize,  feat=mean(feat),price=mean(price))
##         brand      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#use plyr to look at mean and SD of price by brand-featured combination
ddply(oj, .(brand,feat), summarize,  mean_price=mean(price),sd_price=sd(price), obs=length(price))
## Warning: package 'bindrcpp' was built under R version 3.3.3
##         brand feat mean_price  sd_price  obs
## 1   dominicks    0   1.795460 0.3823584 7169
## 2   dominicks    1   1.563375 0.3415036 2480
## 3 minute.maid    0   2.328877 0.4080230 6865
## 4 minute.maid    1   2.024867 0.3014639 2784
## 5   tropicana    0   2.966131 0.5132730 8045
## 6   tropicana    1   2.390817 0.4614941 1604
## simple regression
reg = glm(logmove ~ log(price)*feat + brand, data=oj)
summary(reg)
## 
## Call:
## glm(formula = logmove ~ log(price) * feat + brand, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6036  -0.4410  -0.0118   0.4183   3.2952  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.12520    0.01580  640.68   <2e-16 ***
## log(price)       -2.34447    0.02292 -102.29   <2e-16 ***
## feat              1.48028    0.02719   54.45   <2e-16 ***
## brandminute.maid  0.69440    0.01167   59.49   <2e-16 ***
## brandtropicana    1.29026    0.01470   87.78   <2e-16 ***
## log(price):feat  -0.87757    0.03741  -23.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4942208)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 14303  on 28941  degrees of freedom
## AIC: 61755
## 
## Number of Fisher Scoring iterations: 2
reg1 = glm(logmove ~ log(price)*brand*feat, data=oj)
summary(reg1)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8893  -0.4290  -0.0091   0.4125   3.2368  
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      10.40658    0.02335 445.668  < 2e-16 ***
## log(price)                       -2.77415    0.03883 -71.445  < 2e-16 ***
## brandminute.maid                  0.04720    0.04663   1.012    0.311    
## brandtropicana                    0.70794    0.05080  13.937  < 2e-16 ***
## feat                              1.09441    0.03810  28.721  < 2e-16 ***
## log(price):brandminute.maid       0.78293    0.06140  12.750  < 2e-16 ***
## log(price):brandtropicana         0.73579    0.05684  12.946  < 2e-16 ***
## log(price):feat                  -0.47055    0.07409  -6.351 2.17e-10 ***
## brandminute.maid:feat             1.17294    0.08196  14.312  < 2e-16 ***
## brandtropicana:feat               0.78525    0.09875   7.952 1.90e-15 ***
## log(price):brandminute.maid:feat -1.10922    0.12225  -9.074  < 2e-16 ***
## log(price):brandtropicana:feat   -0.98614    0.12411  -7.946 2.00e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4829706)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 13975  on 28935  degrees of freedom
## AIC: 61094
## 
## Number of Fisher Scoring iterations: 2
reg1 = lm(logmove ~ log(price)*brand*feat + AGE60 + EDUC + ETHNIC + INCOME+ HHLARGE + WORKWOM + HVAL150 + SSTRDIST
           + SSTRVOL + CPDIST5 + CPWVOL5, data=oj)
summary(reg1)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + AGE60 + EDUC + 
##     ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150 + SSTRDIST + 
##     SSTRVOL + CPDIST5 + CPWVOL5, data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2004 -0.3922 -0.0049  0.3786  2.9884 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      13.143435   0.329622  39.874  < 2e-16 ***
## log(price)                       -2.854302   0.036816 -77.528  < 2e-16 ***
## brandminute.maid                  0.092348   0.044158   2.091  0.03651 *  
## brandtropicana                    0.772280   0.048102  16.055  < 2e-16 ***
## feat                              1.055010   0.036034  29.278  < 2e-16 ***
## AGE60                             2.044239   0.127506  16.032  < 2e-16 ***
## EDUC                              0.955931   0.101486   9.419  < 2e-16 ***
## ETHNIC                            0.643102   0.037229  17.274  < 2e-16 ***
## INCOME                           -0.279937   0.033300  -8.406  < 2e-16 ***
## HHLARGE                          -0.920092   0.230509  -3.992 6.58e-05 ***
## WORKWOM                          -0.424559   0.146824  -2.892  0.00384 ** 
## HVAL150                           0.357561   0.041803   8.554  < 2e-16 ***
## SSTRDIST                         -0.019979   0.001465 -13.634  < 2e-16 ***
## SSTRVOL                          -0.055948   0.009768  -5.728 1.03e-08 ***
## CPDIST5                           0.064160   0.006289  10.203  < 2e-16 ***
## CPWVOL5                          -0.500272   0.025703 -19.464  < 2e-16 ***
## log(price):brandminute.maid       0.753984   0.058086  12.981  < 2e-16 ***
## log(price):brandtropicana         0.713983   0.053736  13.287  < 2e-16 ***
## log(price):feat                  -0.405049   0.070053  -5.782 7.45e-09 ***
## brandminute.maid:feat             1.131698   0.077495  14.603  < 2e-16 ***
## brandtropicana:feat               0.746656   0.093353   7.998 1.31e-15 ***
## log(price):brandminute.maid:feat -1.079118   0.115559  -9.338  < 2e-16 ***
## log(price):brandtropicana:feat   -0.989109   0.117302  -8.432  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6568 on 28924 degrees of freedom
## Multiple R-squared:  0.5851, Adjusted R-squared:  0.5848 
## F-statistic:  1854 on 22 and 28924 DF,  p-value: < 2.2e-16
reg2 = lm(logmove ~ log(price)*brand*feat + log(price)*HHLARGE + feat*HHLARGE + log(price)*EDUC + log(price)*INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150 + SSTRDIST
          + SSTRVOL + CPDIST5 + CPWVOL5, data=oj)
summary(reg2)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + log(price) * 
##     HHLARGE + feat * HHLARGE + log(price) * EDUC + log(price) * 
##     INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150 + 
##     SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5575 -0.3758 -0.0060  0.3632  2.9126 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      17.743378   0.588134  30.169  < 2e-16 ***
## log(price)                       -8.836571   0.611508 -14.450  < 2e-16 ***
## brandminute.maid                  0.096972   0.043090   2.250   0.0244 *  
## brandtropicana                    0.805184   0.047040  17.117  < 2e-16 ***
## feat                              1.028299   0.050330  20.431  < 2e-16 ***
## HHLARGE                           4.243096   0.472819   8.974  < 2e-16 ***
## EDUC                             -0.997972   0.170358  -5.858 4.73e-09 ***
## INCOME                           -0.731403   0.058832 -12.432  < 2e-16 ***
## AGE60                             2.039271   0.124414  16.391  < 2e-16 ***
## ETHNIC                            0.665650   0.036344  18.315  < 2e-16 ***
## WORKWOM                          -0.346427   0.143286  -2.418   0.0156 *  
## HVAL150                           0.383774   0.040794   9.408  < 2e-16 ***
## SSTRDIST                         -0.019730   0.001430 -13.796  < 2e-16 ***
## SSTRVOL                          -0.051052   0.009531  -5.356 8.57e-08 ***
## CPDIST5                           0.060879   0.006137   9.919  < 2e-16 ***
## CPWVOL5                          -0.502568   0.025080 -20.038  < 2e-16 ***
## log(price):brandminute.maid       0.749394   0.056679  13.222  < 2e-16 ***
## log(price):brandtropicana         0.686462   0.052492  13.077  < 2e-16 ***
## log(price):feat                  -0.392707   0.068368  -5.744 9.34e-09 ***
## brandminute.maid:feat             1.123165   0.075615  14.854  < 2e-16 ***
## brandtropicana:feat               0.734101   0.091122   8.056 8.17e-16 ***
## log(price):HHLARGE               -6.543458   0.497739 -13.146  < 2e-16 ***
## feat:HHLARGE                      0.188415   0.307068   0.614   0.5395    
## log(price):EDUC                   2.375998   0.174291  13.632  < 2e-16 ***
## log(price):INCOME                 0.583554   0.061208   9.534  < 2e-16 ***
## log(price):brandminute.maid:feat -1.071241   0.112753  -9.501  < 2e-16 ***
## log(price):brandtropicana:feat   -0.988814   0.114465  -8.639  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6408 on 28920 degrees of freedom
## Multiple R-squared:  0.6051, Adjusted R-squared:  0.6048 
## F-statistic:  1705 on 26 and 28920 DF,  p-value: < 2.2e-16
#Predict command & fair r^2
oj$reg2_hat <- predict(reg2, data =oj)
cor(oj$logmove,oj$reg2_hat)
## [1] 0.7779064
#2.b.: How much does education and large HH size matter?
exp(coef(reg2)["HHLARGE"]*(summary(oj$HHLARGE)["3rd Qu."]-summary(oj$HHLARGE)["Median"]))
## HHLARGE 
##  1.1072
exp(coef(reg2)["EDUC"]*(summary(oj$EDUC)["Max."]-summary(oj$EDUC)["Median"]))
##      EDUC 
## 0.7420093
#Which model fists better out of sample?
set.seed(1)
sample_percent <- .8
N <- length(oj$logmove) 
sample_size <- round(sample_percent*N)
oj_subset <- sample_n(oj,sample_size,replace = FALSE, weight = NULL, .env = NULL)
#NOTE: We can do the above three lines in a single line.  Can you think of how?  
test = anti_join(oj,oj_subset)
## Joining, by = c("store", "brand", "week", "logmove", "feat", "price", "AGE60", "EDUC", "ETHNIC", "INCOME", "HHLARGE", "WORKWOM", "HVAL150", "SSTRDIST", "SSTRVOL", "CPDIST5", "CPWVOL5", "reg2_hat")
#Alternative method so that we don't have to keep track of what we didn't sample.

reg3 = glm(logmove ~ log(price)*brand*feat, data=oj_subset)
oj_subset$reg3_hat <- predict(reg3)
MSE_train_reg3 <- mean((oj_subset$reg3_hat - oj_subset$logmove)^2)
MSE_train_reg3
## [1] 0.479234
# MSE for the training set for this model is .479

#Get predicted values for the test set.
test$y_hat_test <- predict(reg3,test)
MSE_test_reg3 <- mean((test$y_hat_test - test$logmove)^2)
MSE_test_reg3
## [1] 0.4972704
# We can see the test set MSE is .497 and the training set is .479.  Its to be expected that the test set MSE is greater than the training set MSE, as shown in the figure from class. 

#Now lets try with a slightly more complex model
reg4= glm(logmove ~ log(price)*brand*feat + log(price)*HHLARGE + feat*HHLARGE + log(price)*EDUC + log(price)*INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150 + SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data=oj_subset)
oj_subset$reg4_hat <- predict(reg4)
MSE_train_reg4 <- mean((oj_subset$reg4_hat - oj_subset$logmove)^2)
MSE_train_reg4
## [1] 0.4066122
# MSE for the training set for this model is .407

#Get predicted values for the test set.
test$y_hat_test <- predict(reg4,test)
MSE_test_reg4 <- mean((test$y_hat_test - test$logmove)^2)
MSE_test_reg4
## [1] 0.425828
#The model 4 has a lower out of sample MSE than the simpler model 3: .426 versus .497



#Let's see how many weeks we have:
unique(oj$week)
##   [1]  40  46  47  48  50  51  52  53  54  57  58  59  60  61  62  63  64
##  [18]  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81
##  [35]  82  83  84  85  86  87  88  89  90  91  92  93  94  95  97  98  99
##  [52] 100 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
##  [69] 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
##  [86] 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
## [103] 153 154 155 156 157 158 159 160  42  43  44  49  55  56  96 101 102
## [120]  41  45
#Create a new df which reorders the old one. NOTE: We could easily just call this oj rather than df and avoid the redundancy
df=oj[order(oj$week,oj$store),]
#Investigating this you'll see some stores have missing weeks.
#The reordered data is summarized differently 
unique(df$week)
##   [1]  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56
##  [18]  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
##  [35]  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [52]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
##  [69] 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
##  [86] 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
## [103] 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
## [120] 159 160
#First why to add in lagged weeks
df1 <-oj
df1$week<-df1$week+1  # df1 now has NEXT week and not the current one.  If we merge this by weeks now, this is last week's price (e.g., "lagged price").
myvars <- c("price", "week", "brand","store")
df1 <- df1[myvars]
lagged <- merge(oj, df1, by=c("brand","store","week"))
#NOTE: The number of observations decreased.  Why?  You've just lost (at least) one week's worth of data at each store.
lagged=lagged[order(lagged$week,lagged$store),]
lagged=lagged[order(lagged$store,lagged$week),]
#Comparing this to above, Store two is nowhere to be found.  There was missing data for consecutive weeks.  As a result, it gets dropped.
colnames(lagged)[18] <- "lagged_price"
colnames(lagged)[6] <- "price"

reglag = glm(logmove ~ log(price)*brand*feat + log(lagged_price)*brand, data=lagged)
summary(reglag)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat + log(lagged_price) * 
##     brand, data = lagged)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2850  -0.3764  -0.0116   0.3582   2.9140  
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        -7.817991   0.529156 -14.774  < 2e-16
## log(price)                         -0.354609   0.079164  -4.479 7.51e-06
## brandminute.maid                   -0.023613   0.786943  -0.030 0.976062
## brandtropicana                     -7.773151   0.730465 -10.641  < 2e-16
## feat                                0.392746   0.041201   9.533  < 2e-16
## log(lagged_price)                   7.745386   0.224636  34.480  < 2e-16
## log(price):brandminute.maid         0.113267   0.106588   1.063 0.287944
## log(price):brandtropicana           0.766505   0.099682   7.689 1.53e-14
## log(price):feat                    -0.385704   0.068873  -5.600 2.16e-08
## brandminute.maid:feat               0.321076   0.093527   3.433 0.000598
## brandtropicana:feat                -0.422363   0.101278  -4.170 3.05e-05
## brandminute.maid:log(lagged_price)  0.003689   0.333247   0.011 0.991168
## brandtropicana:log(lagged_price)    3.259095   0.305186  10.679  < 2e-16
## log(price):brandminute.maid:feat   -0.298117   0.117503  -2.537 0.011183
## log(price):brandtropicana:feat      0.335400   0.119009   2.818 0.004832
##                                       
## (Intercept)                        ***
## log(price)                         ***
## brandminute.maid                      
## brandtropicana                     ***
## feat                               ***
## log(lagged_price)                  ***
## log(price):brandminute.maid           
## log(price):brandtropicana          ***
## log(price):feat                    ***
## brandminute.maid:feat              ***
## brandtropicana:feat                ***
## brandminute.maid:log(lagged_price)    
## brandtropicana:log(lagged_price)   ***
## log(price):brandminute.maid:feat   *  
## log(price):brandtropicana:feat     ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4073141)
## 
##     Null deviance: 29152  on 28007  degrees of freedom
## Residual deviance: 11402  on 27993  degrees of freedom
## AIC: 54344
## 
## Number of Fisher Scoring iterations: 2
reglag2 = glm(logmove ~ log(price)*brand*feat + log(lagged_price)*brand +AGE60 + EDUC + ETHNIC + INCOME+ HHLARGE + WORKWOM + HVAL150 + SSTRDIST
           + SSTRVOL + CPDIST5 + CPWVOL5, data=lagged)
summary(reglag2)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat + log(lagged_price) * 
##     brand + AGE60 + EDUC + ETHNIC + INCOME + HHLARGE + WORKWOM + 
##     HVAL150 + SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data = lagged)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2892  -0.3778  -0.0087   0.3605   2.9164  
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        -5.263499   0.846520  -6.218 5.11e-10
## log(price)                         -0.610808   0.103296  -5.913 3.39e-09
## brandminute.maid                   -0.550193   0.811592  -0.678 0.497829
## brandtropicana                     -9.321690   0.780484 -11.943  < 2e-16
## feat                                0.464101   0.044934  10.328  < 2e-16
## log(lagged_price)                   6.947686   0.303715  22.876  < 2e-16
## AGE60                               0.153435   0.137274   1.118 0.263694
## EDUC                                0.005954   0.103908   0.057 0.954303
## ETHNIC                              0.043431   0.040606   1.070 0.284831
## INCOME                             -0.067636   0.033730  -2.005 0.044953
## HHLARGE                             0.301900   0.230074   1.312 0.189468
## WORKWOM                            -0.070105   0.145379  -0.482 0.629651
## HVAL150                             0.040713   0.042294   0.963 0.335742
## SSTRDIST                           -0.001526   0.001551  -0.983 0.325439
## SSTRVOL                             0.001051   0.009784   0.107 0.914412
## CPDIST5                             0.006086   0.006453   0.943 0.345610
## CPWVOL5                            -0.046057   0.028831  -1.597 0.110172
## log(price):brandminute.maid         0.232688   0.108512   2.144 0.032014
## log(price):brandtropicana           0.985234   0.106123   9.284  < 2e-16
## log(price):feat                    -0.384121   0.068868  -5.578 2.46e-08
## brandminute.maid:feat               0.358305   0.098656   3.632 0.000282
## brandtropicana:feat                -0.476780   0.103136  -4.623 3.80e-06
## brandminute.maid:log(lagged_price)  0.230485   0.343489   0.671 0.502219
## brandtropicana:log(lagged_price)    3.923173   0.326194  12.027  < 2e-16
## log(price):brandminute.maid:feat   -0.358400   0.120602  -2.972 0.002963
## log(price):brandtropicana:feat      0.322107   0.121341   2.655 0.007946
##                                       
## (Intercept)                        ***
## log(price)                         ***
## brandminute.maid                      
## brandtropicana                     ***
## feat                               ***
## log(lagged_price)                  ***
## AGE60                                 
## EDUC                                  
## ETHNIC                                
## INCOME                             *  
## HHLARGE                               
## WORKWOM                               
## HVAL150                               
## SSTRDIST                              
## SSTRVOL                               
## CPDIST5                               
## CPWVOL5                               
## log(price):brandminute.maid        *  
## log(price):brandtropicana          ***
## log(price):feat                    ***
## brandminute.maid:feat              ***
## brandtropicana:feat                ***
## brandminute.maid:log(lagged_price)    
## brandtropicana:log(lagged_price)   ***
## log(price):brandminute.maid:feat   ** 
## log(price):brandtropicana:feat     ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4069148)
## 
##     Null deviance: 29152  on 28007  degrees of freedom
## Residual deviance: 11386  on 27982  degrees of freedom
## AIC: 54328
## 
## Number of Fisher Scoring iterations: 2
# You would expect to see decreases in lagged prices decrease this weeks sales (e.g., substitutes).  That is exactly what we find.  The effect is largest for Dominicks, medium for Minute Maid and smallest for Tropicana.  Consistent with current period elasticity estimates.  


# There ought to be a way to figure this out with panel data but I couldn't figure it out.
# pot_dates<-data.frame(seq(as.Date("1993/3/1"), as.Date("1997/3/1"), "1 week"))
# colnames(pot_dates)[1] <- "faux_date"
# pot_dates$week = 1:nrow(pot_dates)
# oj_dates <- merge(oj, pot_dates, by=c("week"))


library(reshape2)
#dcast is a function in the reshape2 library that turns "long data" into "wide data""
oj_prices <-oj[,1:6]
oj_wide <- dcast(oj_prices, store + week ~ brand)
## Using price as value column: use value.var to override.
colnames(oj_wide)[3] <- "P_Dom"
colnames(oj_wide)[4] <- "P_MM"
colnames(oj_wide)[5] <- "P_Trop"
oj_cross <- merge(oj, oj_wide, by=c("week","store"))

#Merge the wide data back in then only look at the cross price elasticity matrix for tropicana.
trop_cross <- subset(oj_cross, brand=="tropicana") 

regcross = glm(logmove ~ log(P_Dom)*feat+log(P_MM)*feat+log(P_Trop)*feat, data=trop_cross)
summary(regcross)
## 
## Call:
## glm(formula = logmove ~ log(P_Dom) * feat + log(P_MM) * feat + 
##     log(P_Trop) * feat, data = trop_cross)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3191  -0.3801   0.0011   0.3661   2.6433  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.92239    0.04339 251.738  < 2e-16 ***
## log(P_Dom)        0.14741    0.02916   5.056 4.36e-07 ***
## feat              1.50603    0.09893  15.224  < 2e-16 ***
## log(P_MM)         0.29909    0.03741   7.995 1.44e-15 ***
## log(P_Trop)      -2.15199    0.03728 -57.721  < 2e-16 ***
## log(P_Dom):feat   0.13039    0.10214   1.277    0.202    
## feat:log(P_MM)    0.68484    0.11157   6.138 8.67e-10 ***
## feat:log(P_Trop) -1.76597    0.09519 -18.553  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3535176)
## 
##     Null deviance: 6927.8  on 9648  degrees of freedom
## Residual deviance: 3408.3  on 9641  degrees of freedom
## AIC: 17359
## 
## Number of Fisher Scoring iterations: 2
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.3.3
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
###################
# In Class
###################
x <- as.matrix(oj_cross[ ,5:17])
y <- as.numeric(as.matrix(oj_cross[ ,4]))
set.seed(720)
#lasso_v1 <- cv.glmnet(x, y, alpha=1)
lasso_v1 <- glmnet(x, y, alpha=1)

#Results
plot(lasso_v1)
coef(lasso_v1, s=lasso_v1$lambda.min)
## 14 x 71 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 71 column names 's0', 's1', 's2' ... ]]
##                                                                          
## (Intercept) 9.167864 9.1407774 9.116097 9.0936094  9.12952205  9.20635703
## feat        .        0.1141635 0.218185 0.3129655  0.38883467  0.44975799
## price       .        .         .        .         -0.02362047 -0.06361618
## AGE60       .        .         .        .          .           .         
## EDUC        .        .         .        .          .           .         
## ETHNIC      .        .         .        .          .           .         
## INCOME      .        .         .        .          .           .         
## HHLARGE     .        .         .        .          .           .         
## WORKWOM     .        .         .        .          .           .         
## HVAL150     .        .         .        .          .           .         
## SSTRDIST    .        .         .        .          .           .         
## SSTRVOL     .        .         .        .          .           .         
## CPDIST5     .        .         .        .          .           .         
## CPWVOL5     .        .         .        .          .           .         
##                                                                   
## (Intercept)  9.2763665  9.3401565  9.3982796  9.4512392  9.4994940
## feat         0.5052686  0.5558478  0.6019338  0.6439255  0.6821868
## price       -0.1000589 -0.1332641 -0.1635194 -0.1910870 -0.2162055
## AGE60        .          .          .          .          .        
## EDUC         .          .          .          .          .        
## ETHNIC       .          .          .          .          .        
## INCOME       .          .          .          .          .        
## HHLARGE      .          .          .          .          .        
## WORKWOM      .          .          .          .          .        
## HVAL150      .          .          .          .          .        
## SSTRDIST     .          .          .          .          .        
## SSTRVOL      .          .          .          .          .        
## CPDIST5      .          .          .          .          .        
## CPWVOL5      .          .          .          .          .        
##                                                                   
## (Intercept)  9.5434620  9.5835205  9.6200238  9.6532842  9.6835899
## feat         0.7170491  0.7488196  0.7777624  0.8041340  0.8281629
## price       -0.2390926 -0.2599454 -0.2789468 -0.2962601 -0.3120353
## AGE60        .          .          .          .          .        
## EDUC         .          .          .          .          .        
## ETHNIC       .          .          .          .          .        
## INCOME       .          .          .          .          .        
## HHLARGE      .          .          .          .          .        
## WORKWOM      .          .          .          .          .        
## HVAL150      .          .          .          .          .        
## SSTRDIST     .          .          .          .          .        
## SSTRVOL      .          .          .          .          .        
## CPDIST5      .          .          .          .          .        
## CPWVOL5      .          .          .          .          .        
##                                                                        
## (Intercept)  9.7116301906  9.75944331  9.80300028  9.8215186  9.8354702
## feat         0.8500570613  0.86989873  0.88797889  0.9042502  0.9190135
## price       -0.3264091101 -0.33972311 -0.35185073 -0.3634013 -0.3740018
## AGE60        .             .           .           0.1215038  0.2491150
## EDUC         .             .           .           .          .        
## ETHNIC       .             .           .           .          .        
## INCOME       .             .           .           .          .        
## HHLARGE      .             .           .           .          .        
## WORKWOM      .             .           .           .          .        
## HVAL150      .             .           .           .          .        
## SSTRDIST     .             .           .           .          .        
## SSTRVOL      .             .           .           .          .        
## CPDIST5      .             .           .           .          .        
## CPWVOL5     -0.0009725857 -0.05139639 -0.09734056 -0.1361884 -0.1711660
##                                                                        
## (Intercept)  9.8481742  9.8597496  9.8926216  9.9358509511  9.977329881
## feat         0.9324797  0.9447496  0.9558200  0.9658629881  0.975083229
## price       -0.3836578 -0.3924560 -0.4006704 -0.4082413017 -0.414983648
## AGE60        0.3653810  0.4713183  0.5449372  0.5985727539  0.648642607
## EDUC         .          .          .          .             .          
## ETHNIC       .          .          .          .             .          
## INCOME       .          .          .          .             .          
## HHLARGE      .          .         -0.1379716 -0.3417576858 -0.515918371
## WORKWOM      .          .          .          .             .          
## HVAL150      .          .          .          .             .          
## SSTRDIST     .          .          .          .             .          
## SSTRVOL      .          .          .         -0.0007441375 -0.006413966
## CPDIST5      .          .          .          .             .          
## CPWVOL5     -0.2030364 -0.2320755 -0.2629359 -0.2929214209 -0.315630024
##                                                                        
## (Intercept) 10.01164171 10.04170811 10.04221459 10.03931274 10.03720667
## feat         0.98334800  0.99084183  0.99742551  1.00336882  1.00879857
## price       -0.42139775 -0.42731450 -0.43327425 -0.43878987 -0.44379678
## AGE60        0.70163225  0.75245217  0.82714206  0.89974537  0.96497065
## EDUC         .           .           .           .           .         
## ETHNIC       0.03116777  0.06860358  0.10979578  0.14798211  0.18294975
## INCOME       .           .           .           .           .         
## HHLARGE     -0.69795856 -0.86901831 -0.92618184 -0.96512216 -1.00288450
## WORKWOM      .           .           .           .           .         
## HVAL150      .           .           0.02673622  0.05382243  0.07816756
## SSTRDIST     .           .           .           .           .         
## SSTRVOL     -0.01585384 -0.02581739 -0.03179663 -0.03691159 -0.04169514
## CPDIST5      .           .           .           .           .         
## CPWVOL5     -0.32295238 -0.32558897 -0.33280724 -0.33978634 -0.34597034
##                                                               
## (Intercept) 10.0353005 10.0437034492 10.064166919 10.083801832
## feat         1.0137463  1.0183511861  1.022742778  1.026776897
## price       -0.4483585 -0.4523246629 -0.455513204 -0.458347108
## AGE60        1.0243762  1.0734328079  1.126806701  1.177919246
## EDUC         .          .             0.051974503  0.119102017
## ETHNIC       0.2148065  0.2487941173  0.291697462  0.329560442
## INCOME       .          .             .            .          
## HHLARGE     -1.0373487 -1.0807822367 -1.132383106 -1.187142120
## WORKWOM      .         -0.0161564240 -0.067571986 -0.118901036
## HVAL150      0.1003453  0.1224201350  0.127415242  0.122998180
## SSTRDIST     .         -0.0007065496 -0.002895866 -0.004848582
## SSTRVOL     -0.0460532 -0.0491381437 -0.050495713 -0.052303833
## CPDIST5      .          .             .            .          
## CPWVOL5     -0.3516077 -0.3599499437 -0.374380756 -0.387047515
##                                                                
## (Intercept) 10.1048357402 10.107414197 10.109998008 10.11318145
## feat         1.0304779956  1.033907621  1.037043693  1.03990129
## price       -0.4608910560 -0.463069260 -0.465036948 -0.46683108
## AGE60        1.2163896798  1.253119278  1.287695578  1.31771679
## EDUC         0.1777517994  0.219494474  0.262880660  0.30124312
## ETHNIC       0.3644783142  0.404563894  0.440728221  0.47355469
## INCOME       .             .            .            .         
## HHLARGE     -1.2485270060 -1.310277270 -1.368176774 -1.42214923
## WORKWOM     -0.1716958101 -0.207781239 -0.241612567 -0.27341337
## HVAL150      0.1202011143  0.127932790  0.132331122  0.13679786
## SSTRDIST    -0.0066009820 -0.008318780 -0.009864775 -0.01127094
## SSTRVOL     -0.0540505291 -0.054519593 -0.055192131 -0.05578519
## CPDIST5      0.0007416167  0.006249921  0.011213221  0.01574012
## CPWVOL5     -0.3982510875 -0.410398804 -0.421149737 -0.43094516
##                                                                        
## (Intercept) 10.11606610 10.11905346 10.12172187 10.12370362 10.12576796
## feat         1.04250653  1.04487948  1.04704117  1.04901126  1.05080754
## price       -0.46846122 -0.46995025 -0.47130894 -0.47254337 -0.47366528
## AGE60        1.34555698  1.36994312  1.39202154  1.41330299  1.43253674
## EDUC         0.33800193  0.36967910  0.39768354  0.42521830  0.45115558
## ETHNIC       0.50342990  0.53059883  0.55533582  0.57799835  0.59856799
## INCOME       .           .           .           .           .         
## HHLARGE     -1.47170391 -1.51709151 -1.55815414 -1.59521458 -1.62964311
## WORKWOM     -0.30265360 -0.32948817 -0.35368981 -0.37550423 -0.39586356
## HVAL150      0.14002905  0.14377834  0.14758724  0.15017906  0.15212039
## SSTRDIST    -0.01254836 -0.01371402 -0.01477768 -0.01574560 -0.01662444
## SSTRVOL     -0.05639856 -0.05689113 -0.05729961 -0.05775141 -0.05820102
## CPDIST5      0.01985223  0.02360910  0.02703591  0.03015002  0.03298007
## CPWVOL5     -0.43977377 -0.44790160 -0.45537130 -0.46206506 -0.46811179
##                                                                        
## (Intercept) 10.26238542 10.47377500 10.64662031 10.80215317 10.94649492
## feat         1.05242216  1.05386574  1.05518692  1.05639197  1.05749006
## price       -0.47472774 -0.47575131 -0.47667018 -0.47750566 -0.47826557
## AGE60        1.44849855  1.48110583  1.51084215  1.53596152  1.56052776
## EDUC         0.49452096  0.54388022  0.58935403  0.62992518  0.66867517
## ETHNIC       0.60830456  0.61087508  0.61490493  0.61864977  0.62185498
## INCOME      -0.01329684 -0.03539108 -0.05350640 -0.06968647 -0.08478533
## HHLARGE     -1.63296430 -1.58226543 -1.54466270 -1.51351056 -1.48272274
## WORKWOM     -0.41757163 -0.42116933 -0.42450328 -0.42926060 -0.43254975
## HVAL150      0.15395015  0.16036187  0.16423373  0.16776209  0.17054391
## SSTRDIST    -0.01741779 -0.01823734 -0.01897819 -0.01964469 -0.02025665
## SSTRVOL     -0.05972697 -0.06067713 -0.06164497 -0.06256063 -0.06340240
## CPDIST5      0.03644711  0.04027941  0.04363850  0.04667603  0.04946003
## CPWVOL5     -0.47122030 -0.47478142 -0.47799571 -0.48083104 -0.48344597
##                                                                        
## (Intercept) 11.07522765 11.19145067 11.30072940 11.39683237 11.48764570
## feat         1.05849097  1.05940272  1.06023305  1.06099039  1.06167989
## price       -0.47895837 -0.47959069 -0.48016634 -0.48069041 -0.48116793
## AGE60        1.58106803  1.59917250  1.61769847  1.63293056  1.64832449
## EDUC         0.70236418  0.73219792  0.76126014  0.78630060  0.81057737
## ETHNIC       0.62496864  0.62789620  0.63032756  0.63277106  0.63477730
## INCOME      -0.09815564 -0.11020039 -0.12163149 -0.13160166 -0.14109922
## HHLARGE     -1.45747423 -1.43525819 -1.41186068 -1.39344913 -1.37406094
## WORKWOM     -0.43682307 -0.44100682 -0.44346156 -0.44681121 -0.44889653
## HVAL150      0.17339440  0.17624207  0.17847118  0.18068769  0.18247531
## SSTRDIST    -0.02080819 -0.02130947 -0.02177282 -0.02218893 -0.02257335
## SSTRVOL     -0.06417305 -0.06486414 -0.06549315 -0.06607150 -0.06659892
## CPDIST5      0.05197603  0.05426335  0.05637189  0.05826656  0.06001662
## CPWVOL5     -0.48578067 -0.48790853 -0.48989281 -0.49166540 -0.49330596
##                                                                        
## (Intercept) 11.57121358 11.64322915 11.71188536 11.77026135 11.82688741
## feat         1.06230843  1.06288190  1.06340367  1.06387990  1.06431293
## price       -0.48160196 -0.48199724 -0.48235806 -0.48268636 -0.48298636
## AGE60        1.66297774  1.67442466  1.68597002  1.69508994  1.70446890
## EDUC         0.83350317  0.85258239  0.87102350  0.88628835  0.90129951
## ETHNIC       0.63653044  0.63839470  0.63990911  0.64153652  0.64283825
## INCOME      -0.14986750 -0.15733237 -0.16450424 -0.17054067 -0.17644738
## HHLARGE     -1.35564860 -1.34218400 -1.32772651 -1.31730714 -1.30557747
## WORKWOM     -0.45045686 -0.45309549 -0.45481101 -0.45713610 -0.45866473
## HVAL150      0.18385891  0.18538936  0.18668935  0.18805401  0.18922509
## SSTRDIST    -0.02292492 -0.02323826 -0.02352841 -0.02378680 -0.02402753
## SSTRVOL     -0.06708797 -0.06753414 -0.06793939 -0.06830343 -0.06863632
## CPDIST5      0.06161482  0.06303985  0.06436214  0.06553462  0.06663147
## CPWVOL5     -0.49480707 -0.49614294 -0.49737279 -0.49849026 -0.49950966
##                                                                        
## (Intercept) 11.87975982 11.92261018 11.96497650 12.00504062 12.04225384
## feat         1.06470747  1.06506803  1.06539574  1.06569419  1.06596618
## price       -0.48325930 -0.48350700 -0.48373366 -0.48394010 -0.48412785
## AGE60        1.71371279  1.72064946  1.72764825  1.73459765  1.74129792
## EDUC         0.91570604  0.92715928  0.93847225  0.94938842  0.95976570
## ETHNIC       0.64392219  0.64519666  0.64622738  0.64705284  0.64774361
## INCOME      -0.18199643 -0.18643376 -0.19084827 -0.19504941 -0.19896819
## HHLARGE     -1.29380428 -1.28630591 -1.27774795 -1.26889598 -1.26027079
## WORKWOM     -0.45961887 -0.46124841 -0.46249053 -0.46327605 -0.46377727
## HVAL150      0.19013303  0.19107511  0.19193456  0.19261802  0.19313645
## SSTRDIST    -0.02424881 -0.02444275 -0.02462476 -0.02479218 -0.02494560
## SSTRVOL     -0.06894211 -0.06921658 -0.06947031 -0.06970395 -0.06991969
## CPDIST5      0.06763884  0.06851314  0.06934020  0.07010313  0.07080217
## CPWVOL5     -0.50045096 -0.50131345 -0.50208183 -0.50278732 -0.50343493
##                                                                        
## (Intercept) 12.07003730 12.09834518 12.12570014 12.15153180 12.17560809
## feat         1.06621533  1.06644152  1.06664732  1.06683475  1.06700553
## price       -0.48429789 -0.48445364 -0.48459574 -0.48472514 -0.48484286
## AGE60        1.74611376  1.75083345  1.75553210  1.76012292  1.76452883
## EDUC         0.96752947  0.97525261  0.98275548  0.98994765  0.99677345
## ETHNIC       0.64866837  0.64945562  0.65006404  0.65054298  0.65093151
## INCOME      -0.20185225 -0.20479850 -0.20766187 -0.21037885 -0.21292045
## HHLARGE     -1.25559814 -1.25018337 -1.24432614 -1.23838530 -1.23259375
## WORKWOM     -0.46476790 -0.46569126 -0.46632786 -0.46673119 -0.46697443
## HVAL150      0.19366728  0.19421376  0.19467107  0.19502839  0.19529366
## SSTRDIST    -0.02507703 -0.02520260 -0.02531819 -0.02542417 -0.02552117
## SSTRVOL     -0.07010783 -0.07028157 -0.07044441 -0.07059554 -0.07073550
## CPDIST5      0.07138678  0.07195168  0.07247697  0.07296088  0.07340466
## CPWVOL5     -0.50405175 -0.50458856 -0.50507022 -0.50550904 -0.50591015
##                                    
## (Intercept) 12.19786954 12.21177344
## feat         1.06716120  1.06731495
## price       -0.48494990 -0.48504614
## AGE60        1.76869673  1.77160439
## EDUC         1.00320019  1.00753889
## ETHNIC       0.65125494  0.65182268
## INCOME      -0.21527655 -0.21673445
## HHLARGE     -1.22708796 -1.22476187
## WORKWOM     -0.46711712 -0.46750152
## HVAL150      0.19548036  0.19565647
## SSTRDIST    -0.02560982 -0.02568297
## SSTRVOL     -0.07086517 -0.07096744
## CPDIST5      0.07381050  0.07412383
## CPWVOL5     -0.50627641 -0.50666569
# Now ready for cross validation version of the object
cvfit <- cv.glmnet(x, y, alpha=1)
#Results
plot(cvfit)

cvfit$lambda.min
## [1] 0.0008909527
log(cvfit$lambda.min)
## [1] -7.023219
coef(cvfit, s = "lambda.min")
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) 12.19786954
## feat         1.06716120
## price       -0.48494990
## AGE60        1.76869673
## EDUC         1.00320019
## ETHNIC       0.65125494
## INCOME      -0.21527655
## HHLARGE     -1.22708796
## WORKWOM     -0.46711712
## HVAL150      0.19548036
## SSTRDIST    -0.02560982
## SSTRVOL     -0.07086517
## CPDIST5      0.07381050
## CPWVOL5     -0.50627641
# We see that orange juice is a substitute for other orange juice. Minute Maid has twice the cross price elasticity as Dominicks, which makes sense.


# The key this here is that the week variable is formatted as a date variable.  This provides R with some information that it is a panel dataset


 #create a date and sequence accompanying the dates within the dataframe then use lag operaters to make progress on it.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.