##v2 
#calculate forest for each brand separately [require initial dcast to include lagged other prices in prediction]
#remove all contemporaneous vars from forest stage
##remember--
##"there's always a way to write it better"
##"you're doing it wrong"
##"premature optimization is the root of all evil"
##load data, make slight modifications##

## Housekeeping and data load ----
# NOTE(review): rm(list = ls()) in scripts is discouraged -- it wipes the
# caller's workspace. Kept for now; prefer running in a fresh session.
rm(list = ls())
#setwd("C:/Users/Thor/Dropbox/Spring2019/Econ 487")
# read.csv() already returns a data.frame; the extra data.frame() wrapper
# was redundant.
oj <- read.csv("C:/Users/Thor/Dropbox/Spring2019/Econ 487/oj.csv")

# Positional renames: column 6 -> "p" (price), column 4 -> "q" (quantity).
# NOTE(review): assumes a fixed column layout of oj.csv -- confirm.
names(oj)[6] <- "p"
names(oj)[4] <- "q"
oj$p <- log(oj$p)  # work with log price from here on
oj_ol <- oj        # a copy to be modified
##make major modifications -- dcast, lags, store/week dums, interactions##

## Reshape: attach every brand's price and quantity to each row ----
library(reshape2)

# One column per brand for price ...
oj_wide2 <- dcast(oj_ol, store + week ~ brand, value.var = "p")
names(oj_wide2)[3:5] <- c("p_D", "p_M", "p_T")
# ... and one per brand for quantity.
oj_wide3 <- dcast(oj_ol, store + week ~ brand, value.var = "q")
names(oj_wide3)[3:5] <- c("q_D", "q_M", "q_T")

# Combine the two wide tables, then join them back onto the long data so
# every observation carries all three brands' prices (p_D/p_M/p_T) and
# quantities (q_D/q_M/q_T) for its store-week.
oj_wide <- merge(oj_wide2, oj_wide3, by = c("week", "store"))
oj_ol <- merge(oj_ol, oj_wide, by = c("week", "store"))


## Build lags 1..5 of each brand's price, quantity, and feat ----
# Strategy: self-merge. A copy with week shifted forward by i pairs each
# (brand, store, week) row with its own values from i weeks earlier; the
# merged-in columns become the lag-i regressors.
lag <- 5
# Columns to carry in the shifted copy (loop-invariant; hoisted out).
# First 6 are the wide p/q columns, then feat, then the merge keys.
myvars <- c("p_D", "p_M", "p_T", "q_D", "q_M", "q_T",
            "feat", "week", "brand", "store")
for (i in seq_len(lag)) {
  Df1 <- oj_ol
  Df1$week <- Df1$week + i  # shift so the merge pairs week t with week t-i
  Df1 <- Df1[myvars]
  # Inner merge: store-weeks without a full lag history drop out.
  oj_ol <- merge(oj_ol, Df1, by = c("brand", "store", "week"))
  # The 7 merged-in value columns arrive last; rename them <var>_l<i>.
  for (j in seq_len(length(myvars) - 3)) {
    names(oj_ol)[ncol(oj_ol) - (length(myvars) - 3 - j)] <-
      paste0(myvars[j], "_l", i)
  }
  # merge() suffixed the pre-existing duplicates with .x; restore their
  # original names by position.
  # NOTE(review): positional renames (18:23 and 5) assume merge() keeps
  # this exact column order -- fragile; verify if the layout changes.
  names(oj_ol)[18:23] <- myvars[1:6]
  names(oj_ol)[5] <- "feat"
}

## Dummy variables for store and week ----
# Adds one 0/1 indicator column per distinct store and per distinct week,
# so the column count grows substantially.
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 3.5.3
oj_ol <- dummy_cols(oj_ol, select_columns = c("store", "week"))
dim(oj_ol)
## [1] 18537   247
## Interaction terms ----
# Lag-1 price, quantity, and feat, each interacted with four demographics.
# Columns are named at creation via dimnames: unnamed matrix columns would
# otherwise enter the data frame as "1".."12", which randomForest() cannot
# handle in a formula. (Could be a loop if more lag*demo terms are wanted.)
demo_vars <- c("AGE60", "EDUC", "INCOME", "HVAL150")
interacts <- matrix(
  c(oj_ol$p_l1 * oj_ol$AGE60,    oj_ol$p_l1 * oj_ol$EDUC,
    oj_ol$p_l1 * oj_ol$INCOME,   oj_ol$p_l1 * oj_ol$HVAL150,
    oj_ol$q_l1 * oj_ol$AGE60,    oj_ol$q_l1 * oj_ol$EDUC,
    oj_ol$q_l1 * oj_ol$INCOME,   oj_ol$q_l1 * oj_ol$HVAL150,
    oj_ol$feat_l1 * oj_ol$AGE60, oj_ol$feat_l1 * oj_ol$EDUC,
    oj_ol$feat_l1 * oj_ol$INCOME, oj_ol$feat_l1 * oj_ol$HVAL150),
  ncol = 12,
  dimnames = list(NULL, c(paste0("p_", demo_vars),
                          paste0("q_", demo_vars),
                          paste0("feat_", demo_vars)))
)
oj_ol <- cbind(oj_ol, interacts)  # kept as a named object for readability

# randomForest() choked on the names of the last 12 columns, so give the
# interaction columns explicit <var>_<demographic> names by position:
# columns 1-4 are p_l1 interactions, 5-8 q_l1, 9-12 feat_l1.
my_names <- c("AGE60", "EDUC", "INCOME", "HVAL150")
for (i in seq_along(my_names)) {
  pos <- ncol(oj_ol) - 12 + i
  names(oj_ol)[pos]     <- paste0("p_", my_names[i])
  names(oj_ol)[pos + 4] <- paste0("q_", my_names[i])
  names(oj_ol)[pos + 8] <- paste0("feat_", my_names[i])
}
##estimate f and g using random forest##

# Drop contemporaneous variables, else they bias the first-stage
# prediction (requires further research...).
# Guard the lookups first: with negative indexing, an empty which()
# result would make df[, -integer(0)] silently drop EVERY column.
stopifnot("feat" %in% names(oj_ol),
          "p_D" %in% names(oj_ol),
          "q_T" %in% names(oj_ol))
oj_ol <- oj_ol[, -which(names(oj_ol) == "feat")]
oj_ol <- oj_ol[, -(which(names(oj_ol) == "p_D"):which(names(oj_ol) == "q_T"))]

brands <- c("dominicks", "minute.maid", "tropicana")

## First stage WITH feat: residualize q and p brand by brand ----
# res is seeded with the right number of rows (one per dominicks row;
# all brands have the same count after the lag merges); columns are
# filled in, two per brand, inside the loop.
res <- data.frame(rep(1, length(which(oj_ol$brand == brands[1]))))

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# NOTE(review): randomForest() is stochastic and no set.seed() is called,
# so elasticities vary run to run -- consider seeding for reproducibility.
for (i in seq_along(brands)) {
  sub_ <- oj_ol[which(oj_ol$brand == brands[i]), ]
  # Predict q without contemporaneous p, and p without contemporaneous q.
  q_forest <- randomForest(q ~ ., data = sub_[, names(sub_) != "p"],
                           ntree = 25, keep.forest = TRUE)
  p_forest <- randomForest(p ~ ., data = sub_[, names(sub_) != "q"],
                           ntree = 25, keep.forest = TRUE)
  # predict() with no newdata returns out-of-bag predictions, so the
  # residuals are not in-sample fits.
  res[, (i - 1) * 2 + 1] <- sub_$q - predict(q_forest)
  res[, (i - 1) * 2 + 2] <- sub_$p - predict(p_forest)
  names(res)[(i - 1) * 2 + 1] <- paste0("q_", brands[i], "_res")
  names(res)[(i - 1) * 2 + 2] <- paste0("p_", brands[i], "_res")
}

## First stage WITHOUT feat ----
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Drops feat along with every feat-derived column (lags + interactions).
oj_ol_no_feat <- select(oj_ol, -starts_with("feat"))
# Consistency fix: seed the residual frame from oj_ol_no_feat (same row
# count as oj_ol, but this is the frame the loop actually subsets).
res_nf <- data.frame(rep(1, length(which(oj_ol_no_feat$brand == brands[1]))))

for (i in seq_along(brands)) {
  sub_ <- oj_ol_no_feat[which(oj_ol_no_feat$brand == brands[i]), ]
  q_forest <- randomForest(q ~ ., data = sub_[, names(sub_) != "p"],
                           ntree = 25, keep.forest = TRUE)
  p_forest <- randomForest(p ~ ., data = sub_[, names(sub_) != "q"],
                           ntree = 25, keep.forest = TRUE)
  # Out-of-bag residuals, as in the with-feat loop above.
  res_nf[, (i - 1) * 2 + 1] <- sub_$q - predict(q_forest)
  res_nf[, (i - 1) * 2 + 2] <- sub_$p - predict(p_forest)
  names(res_nf)[(i - 1) * 2 + 1] <- paste0("q_", brands[i], "_res_nf")
  names(res_nf)[(i - 1) * 2 + 2] <- paste0("p_", brands[i], "_res_nf")
}
##estimate elasticities using residuals from ML prediction

## Second stage: residual-on-residual regressions by brand ----
# Regressing the quantity residual on the price residual (Frisch-Waugh /
# double-ML style) gives the price-elasticity estimate as the slope.
# Coefficient vectors are preallocated rather than grown with c().
co <- numeric(length(brands))     # with feat
co_nf <- numeric(length(brands))  # without feat

for (i in seq_along(brands)) {
  q_col <- (i - 1) * 2 + 1  # quantity-residual column for brand i
  p_col <- (i - 1) * 2 + 2  # price-residual column for brand i
  co[i] <- coef(lm(res[[q_col]] ~ res[[p_col]]))[2]
  co_nf[i] <- coef(lm(res_nf[[q_col]] ~ res_nf[[p_col]]))[2]
}

elast <- data.frame(brand = brands,
                    elasticities = co,
                    elasticities_nf = co_nf)
elast
## (example run -- randomForest is stochastic, so values vary slightly)
##         brand elasticities elasticities_nf
## 1   dominicks    -1.905309       -2.090071
## 2 minute.maid    -1.742731       -1.963868
## 3   tropicana    -2.327439       -2.659529