R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

The syntax on line 19 then the subsequent on line 56 tells R Markdown to run R code without the asterisks embeds the subsequent code as embedded inside the R Markdown file

library(rpart)
## Warning: package 'rpart' was built under R version 3.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
library(partykit)
## Warning: package 'partykit' was built under R version 3.3.3
## Loading required package: grid
# Some advanced packages to be aware of
#library(causalTree)
#library(permute)
library(maptree)
## Warning: package 'maptree' was built under R version 3.3.3
## Loading required package: cluster
#library(formattable)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(doBy)
## Warning: package 'doBy' was built under R version 3.3.3
#library(reshape2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.3.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-16
setwd("C:/Users/jlariv/OneDrive/Econ 404/")
mydata <- read.csv("oj.csv")
colnames(mydata)[1-17]
##  [1] "store"    "brand"    "week"     "logmove"  "feat"     "price"   
##  [7] "AGE60"    "EDUC"     "ETHNIC"   "INCOME"   "HHLARGE"  "WORKWOM" 
## [13] "HVAL150"  "SSTRDIST" "SSTRVOL"  "CPWVOL5"
mydata$Q <- exp(mydata$logmove)

#reduce decimal places
is.num=sapply(mydata, is.numeric)
mydata[is.num] = lapply(mydata[is.num], round, 4)

#Keeps a subset of the data which we care about.  Can merge demographic data back in later.
smaller <-mydata[c("store","brand","week","feat","price","Q")]
#Look to see what are unique elements of brand 
unique(smaller$brand)
## [1] tropicana   minute.maid dominicks  
## Levels: dominicks minute.maid tropicana
#Best possible way of doing the weighted means is using ddply
temp <- ddply(smaller, c('store','week'),function(x) c(weighted_mean = weighted.mean(x$price,x$Q)))
complete = merge(temp,mydata,by=c("store","week"))

#Need to collpse this by brand/price/Q/logmove
complete_tree <- complete[complete$brand=="dominicks",]

dataToPass<- complete_tree[,c("weighted_mean","AGE60","EDUC","ETHNIC","INCOME","HHLARGE","WORKWOM","HVAL150","SSTRDIST","SSTRVOL","CPDIST5","CPWVOL5")]
fit<-rpart(as.formula(weighted_mean ~ .),data=dataToPass,method="anova",cp=0.007)
draw.tree(fit)

#This command draws the tree
dataToPass$leaf = fit$where
#### Alternative code to do this in one line.
#complete$leaf = fit$where
#We find that there are three trees.  2 is the lowest home value and 5 is the highest.
L <- unique(dataToPass$leaf)

#NOTE: This is an issue becuase I had to create a crosswalk with a single variable.  It wouldn't keep the store numbers in the df I read into rpart if they weren't used
store_leaf_crosswalk <- summaryBy(leaf ~ HVAL150, data=dataToPass)
complete<- merge(mydata, store_leaf_crosswalk, by.x="HVAL150", by.y="HVAL150")


#Sanity Check regression (This is the first part of question 2)
reg = lm(logmove~log(price)*as.factor(leaf.mean)*feat, data=subset(complete, brand=="tropicana"))
summary(reg)
## 
## Call:
## lm(formula = logmove ~ log(price) * as.factor(leaf.mean) * feat, 
##     data = subset(complete, brand == "tropicana"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9491 -0.3517 -0.0052  0.3320  3.1819 
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                           11.503134   0.065341 176.047
## log(price)                            -2.700209   0.061429 -43.956
## as.factor(leaf.mean)4                 -0.374997   0.078620  -4.770
## as.factor(leaf.mean)5                  0.336911   0.139131   2.422
## feat                                   1.767017   0.130017  13.591
## log(price):as.factor(leaf.mean)4       0.702305   0.073330   9.577
## log(price):as.factor(leaf.mean)5       0.735242   0.123937   5.932
## log(price):feat                       -1.390375   0.144018  -9.654
## as.factor(leaf.mean)4:feat             0.035973   0.157143   0.229
## as.factor(leaf.mean)5:feat            -0.584777   0.282415  -2.071
## log(price):as.factor(leaf.mean)4:feat -0.002069   0.173115  -0.012
## log(price):as.factor(leaf.mean)5:feat  0.627997   0.304170   2.065
##                                       Pr(>|t|)    
## (Intercept)                            < 2e-16 ***
## log(price)                             < 2e-16 ***
## as.factor(leaf.mean)4                 1.87e-06 ***
## as.factor(leaf.mean)5                   0.0155 *  
## feat                                   < 2e-16 ***
## log(price):as.factor(leaf.mean)4       < 2e-16 ***
## log(price):as.factor(leaf.mean)5      3.09e-09 ***
## log(price):feat                        < 2e-16 ***
## as.factor(leaf.mean)4:feat              0.8189    
## as.factor(leaf.mean)5:feat              0.0384 *  
## log(price):as.factor(leaf.mean)4:feat   0.9905    
## log(price):as.factor(leaf.mean)5:feat   0.0390 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5337 on 9637 degrees of freedom
## Multiple R-squared:  0.6038, Adjusted R-squared:  0.6034 
## F-statistic:  1335 on 11 and 9637 DF,  p-value: < 2.2e-16
#leaf specific regressions
#First reshape the data.  We want to create a price and feature variable for each store and each week.
temp <- subset(complete[,c("price", "brand","store", "week", "logmove", "leaf.mean", "feat")])
temp <- reshape(temp, timevar = "brand", idvar = c("store", "week", "leaf.mean"), direction = "wide")

#NOTE: This is a useful trick to create unique identifiers if you ever need to
#temp$storeweek <- temp$store*10000+temp$week

# Create a vector of the LHS variables we'll eventually want to use.
varlist <- names(temp)[c(5,8,11)]

#Create a matrix to store the coefficients in
mm <- data.frame(matrix(NA, length(varlist)*length(L), length(L)+1))
colnames(mm) <- c("Dom","MinMaid","Trop","leaf")
rownames(mm) <- c("Dom2","MinMaid2","Trop2","Dom4","MinMaid4","Trop4","Dom5","MinMaid5","Trop5")

featured <- data.frame(matrix(NA, length(varlist)*length(L), length(L)+1))
colnames(featured) <- c("Dom","MinMaid","Trop","leaf")
rownames(featured) <- c("Dom2","MinMaid2","Trop2","Dom4","MinMaid4","Trop4","Dom5","MinMaid5","Trop5")

feature_elasticity <- data.frame(matrix(NA, length(varlist)*length(L), length(L)+1))
colnames(feature_elasticity) <- c("Dom","MinMaid","Trop","leaf")
rownames(feature_elasticity) <- c("Dom2","MinMaid2","Trop2","Dom4","MinMaid4","Trop4","Dom5","MinMaid5","Trop5")


for (i in 1:length(L)) {
  temp_df <- subset(temp,leaf.mean==L[i])
   
  regd = lm(logmove.dominicks~log(price.dominicks)*feat.dominicks+log(price.minute.maid)*feat.minute.maid+log(price.tropicana)*feat.tropicana, data=temp_df)
  regm = lm(logmove.minute.maid~log(price.dominicks)*feat.dominicks+log(price.minute.maid)*feat.minute.maid+log(price.tropicana)*feat.tropicana, data=temp_df)
  regt = lm(logmove.tropicana~log(price.dominicks)*feat.dominicks+log(price.minute.maid)*feat.minute.maid+log(price.tropicana)*feat.tropicana, data=temp_df)
  
  mm$Dom[(i-1)*length(L) +1]=coef(regd)[2]
  mm$MinMaid[(i-1)*length(L) +1]=coef(regd)[4]
  mm$Trop[(i-1)*length(L) +1]=coef(regd)[6]
  mm$leaf[(i-1)*length(L) +1]=L[i]
  
  mm$Dom[(i-1)*length(L) +2]=coef(regm)[2]
  mm$MinMaid[(i-1)*length(L) +2]=coef(regm)[4]
  mm$Trop[(i-1)*length(L) +2]=coef(regm)[6]
  mm$leaf[(i-1)*length(L) +2]=L[i]
  
  mm$Dom[(i-1)*length(L) +3]=coef(regt)[2]
  mm$MinMaid[(i-1)*length(L) +3]=coef(regt)[4]
  mm$Trop[(i-1)*length(L) +3]=coef(regt)[6]
  mm$leaf[(i-1)*length(L) +3]=L[i]
  
  featured$Dom[(i-1)*length(L) +1]=coef(regd)[3]
  featured$MinMaid[(i-1)*length(L) +1]=coef(regd)[5]
  featured$Trop[(i-1)*length(L) +1]=coef(regd)[7]
  featured$leaf[(i-1)*length(L) +1]=L[i]
  
  featured$Dom[(i-1)*length(L) +2]=coef(regm)[3]
  featured$MinMaid[(i-1)*length(L) +2]=coef(regm)[5]
  featured$Trop[(i-1)*length(L) +2]=coef(regm)[7]
  featured$leaf[(i-1)*length(L) +2]=L[i]
  
  featured$Dom[(i-1)*length(L) +3]=coef(regt)[3]
  featured$MinMaid[(i-1)*length(L) +3]=coef(regt)[5]
  featured$Trop[(i-1)*length(L) +3]=coef(regt)[7]
  featured$leaf[(i-1)*length(L) +3]=L[i]
  
  feature_elasticity$Dom[(i-1)*length(L) +1]=coef(regd)[8]
  feature_elasticity$MinMaid[(i-1)*length(L) +1]=coef(regd)[9]
  feature_elasticity$Trop[(i-1)*length(L) +1]=coef(regd)[10]
  feature_elasticity$leaf[(i-1)*length(L) +1]=L[i]
  
  feature_elasticity$Dom[(i-1)*length(L) +2]=coef(regm)[8]
  feature_elasticity$MinMaid[(i-1)*length(L) +2]=coef(regm)[9]
  feature_elasticity$Trop[(i-1)*length(L) +2]=coef(regm)[10]
  feature_elasticity$leaf[(i-1)*length(L) +2]=L[i]
  
  feature_elasticity$Dom[(i-1)*length(L) +3]=coef(regt)[8]
  feature_elasticity$MinMaid[(i-1)*length(L) +3]=coef(regt)[9]
  feature_elasticity$Trop[(i-1)*length(L) +3]=coef(regt)[10]
  feature_elasticity$leaf[(i-1)*length(L) +3]=L[i]
  
} 

mm
##                  Dom    MinMaid       Trop leaf
## Dom2     -2.99923415  0.9987842 -0.4388561    2
## MinMaid2  0.58920445 -2.8869155  0.5446749    2
## Trop2     0.24067470  0.2673355 -2.7943267    2
## Dom4     -2.81793935  1.1613242 -0.1902201    4
## MinMaid4  0.64218681 -2.3576936  0.6188461    4
## Trop4     0.19322273  0.3839389 -2.1068391    4
## Dom5     -3.12180413  0.8630777 -0.3477877    5
## MinMaid5  0.21323592 -2.0289084  0.3154762    5
## Trop5    -0.01304219  0.1717764 -1.9797173    5
# The first order take away is that in leaf two stores, the price of Tropicana should be reduced relative to stores in the other two leaves.  
featured
##                  Dom    MinMaid       Trop leaf
## Dom2      1.01411253 0.02005868 -1.4238708    2
## MinMaid2 -0.17994876 1.81966332  0.7311099    2
## Trop2     0.07346138 0.15246705  1.7311293    2
## Dom4      0.98094303 0.25649852 -1.1362970    4
## MinMaid4 -0.04603263 1.77693351  0.7070967    4
## Trop4     0.11314649 0.28212956  1.8137720    4
## Dom5      0.88515348 0.62432522 -1.1972944    5
## MinMaid5 -0.16932436 1.63241491  0.3888789    5
## Trop5     0.04199285 0.27619417  1.2309810    5
# Featuring is not worth very much for Tropicana in leaf 5 stores.  They should avoid paying for end caps in those stores.  
feature_elasticity
##                  Dom    MinMaid       Trop leaf
## Dom2     -0.32890986 -0.1871111  1.4810734    2
## MinMaid2  0.17866014 -1.0354285 -1.0267289    2
## Trop2    -0.26297198 -0.3303132 -1.3764078    2
## Dom4     -0.29689679 -0.4112152  1.1566386    4
## MinMaid4 -0.02563148 -1.0938931 -0.9374040    4
## Trop4    -0.31373123 -0.4611319 -1.4232922    4
## Dom5     -0.08768507 -0.7422086  1.2497859    5
## MinMaid5  0.13835524 -1.0796766 -0.5269599    5
## Trop5    -0.31539000 -0.4201279 -0.8053661    5
# Nothing major to see here; also remember that we don't really trust the interaction of being featured with price anyway.

temp_df <- subset(temp,leaf.mean==5)
regm = lm(logmove.minute.maid~log(price.dominicks)*feat.dominicks+log(price.minute.maid)*feat.minute.maid+log(price.tropicana)*feat.tropicana, data=temp_df)
summary(regm)
## 
## Call:
## lm(formula = logmove.minute.maid ~ log(price.dominicks) * feat.dominicks + 
##     log(price.minute.maid) * feat.minute.maid + log(price.tropicana) * 
##     feat.tropicana, data = temp_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.57579 -0.28373 -0.02425  0.26362  1.56030 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                             10.48825    0.13412  78.199
## log(price.dominicks)                     0.21324    0.08042   2.652
## feat.dominicks                          -0.16932    0.09298  -1.821
## log(price.minute.maid)                  -2.02891    0.11943 -16.988
## feat.minute.maid                         1.63241    0.18412   8.866
## log(price.tropicana)                     0.31548    0.09655   3.267
## feat.tropicana                           0.38888    0.21770   1.786
## log(price.dominicks):feat.dominicks      0.13836    0.17270   0.801
## log(price.minute.maid):feat.minute.maid -1.07968    0.23612  -4.572
## log(price.tropicana):feat.tropicana     -0.52696    0.23218  -2.270
##                                         Pr(>|t|)    
## (Intercept)                              < 2e-16 ***
## log(price.dominicks)                     0.00820 ** 
## feat.dominicks                           0.06904 .  
## log(price.minute.maid)                   < 2e-16 ***
## feat.minute.maid                         < 2e-16 ***
## log(price.tropicana)                     0.00114 ** 
## feat.tropicana                           0.07450 .  
## log(price.dominicks):feat.dominicks      0.42333    
## log(price.minute.maid):feat.minute.maid 5.74e-06 ***
## log(price.tropicana):feat.tropicana      0.02355 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4542 on 673 degrees of freedom
## Multiple R-squared:  0.7175, Adjusted R-squared:  0.7137 
## F-statistic: 189.9 on 9 and 673 DF,  p-value: < 2.2e-16
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Is MSE less for a forest versus a LASSO  for the same model?  
# LASSO
x <- model.matrix(~ log(price) + feat + brand + AGE60 + EDUC + ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150 + SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data=mydata)
y <- as.numeric(as.matrix(mydata$logmove))
set.seed(720)
#lasso_v1 <- cv.glmnet(x, y, alpha=1)
lasso_v1 <- cv.glmnet(x, y, alpha=1)
mse.min <- lasso_v1$cvm[lasso_v1$lambda == lasso_v1$lambda.min]
mse.min
## [1] 0.4514462
# Turn price into log(price) to save a bit of time on this hackjob
mydata$price <- log(mydata$price)

oj.rf <- randomForest(logmove ~ ., data = mydata, ntree = 100, keep.forest = TRUE)
mydata$pred_logmove_rf = predict(oj.rf)
mydata$resid2 <- (mydata$logmove - mydata$pred_logmove_rf)^2
mean(mydata$resid2)
## [1] 0.005584419

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.