R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

The syntax on line 19 then the subsequent on line 56 tells R Markdown to run R code without the asterisks embeds the subsequent code as embedded inside the R Markdown file

library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.3.3
setwd("C:/Users/jlariv/OneDrive/Econ 404/")
oj <- read.csv("oj.csv") 

## create some colors for the brands
levels(oj$brand)
## [1] "dominicks"   "minute.maid" "tropicana"
#brandcol <- c("green","red","gold")
par(mfrow=c(1,2))
ggplot(oj, aes(factor(brand), price)) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(factor(brand), log(price))) + geom_boxplot(aes(fill = factor(brand)))

ggplot(oj, aes(logmove, price)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(price, logmove)) + geom_point(aes(color = factor(brand)))

ggplot(oj, aes(feat, brand)) + geom_point(position = "jitter", aes(color = factor(brand)))

#using aggregate from the base package; 5 and 6 are the location in the column of the dataframe
aggregate(oj[, 5:6], list(oj$brand), mean)
##       Group.1      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#using ddplyr from the plyr package
ddply(oj, .(brand), summarize,  feat=mean(feat),price=mean(price))
##         brand      feat    price
## 1   dominicks 0.2570215 1.735809
## 2 minute.maid 0.2885273 2.241162
## 3   tropicana 0.1662348 2.870493
#use plyr to look at mean and SD of price by brand-featured combination
ddply(oj, .(brand,feat), summarize,  mean_price=mean(price),sd_price=sd(price), obs=length(price))
##         brand feat mean_price  sd_price  obs
## 1   dominicks    0   1.795460 0.3823584 7169
## 2   dominicks    1   1.563375 0.3415036 2480
## 3 minute.maid    0   2.328877 0.4080230 6865
## 4 minute.maid    1   2.024867 0.3014639 2784
## 5   tropicana    0   2.966131 0.5132730 8045
## 6   tropicana    1   2.390817 0.4614941 1604
#The melt command can "reshape" data to do this as well. If you get bored see if you can fiugre out the next line of code.
#grouped <- melt(oj, id.vars=c("brand","price","feat"))

## simple regression
reg = glm(logmove ~ log(price) + feat + brand, data=oj)
summary(reg)
## 
## Call:
## glm(formula = logmove ~ log(price) + feat + brand, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7593  -0.4457  -0.0042   0.4306   3.2375  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.27909    0.01451  708.22   <2e-16 ***
## log(price)       -2.52989    0.02172 -116.50   <2e-16 ***
## feat              0.89062    0.01046   85.13   <2e-16 ***
## brandminute.maid  0.68156    0.01177   57.91   <2e-16 ***
## brandtropicana    1.30176    0.01483   87.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.5036015)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 14575  on 28942  degrees of freedom
## AIC: 62298
## 
## Number of Fisher Scoring iterations: 2
## How do standard errors vary with the sample size?

subsample<-oj[sample(nrow(oj), .2*nrow(oj)), ]
reg_sub = glm(logmove ~ log(price)*feat + brand, data=subsample)
summary(reg_sub)
## 
## Call:
## glm(formula = logmove ~ log(price) * feat + brand, data = subsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.4893  -0.4384  -0.0174   0.4237   3.1394  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.11606    0.03544  285.47   <2e-16 ***
## log(price)       -2.35260    0.05177  -45.45   <2e-16 ***
## feat              1.59508    0.05973   26.71   <2e-16 ***
## brandminute.maid  0.69316    0.02634   26.32   <2e-16 ***
## brandtropicana    1.30183    0.03326   39.14   <2e-16 ***
## log(price):feat  -1.03531    0.08263  -12.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.5005465)
## 
##     Null deviance: 6220.8  on 5788  degrees of freedom
## Residual deviance: 2894.7  on 5783  degrees of freedom
## AIC: 12430
## 
## Number of Fisher Scoring iterations: 2
reg1 = glm(logmove ~ log(price)*brand*feat, data=oj)
summary(reg1)
## 
## Call:
## glm(formula = logmove ~ log(price) * brand * feat, data = oj)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8893  -0.4290  -0.0091   0.4125   3.2368  
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      10.40658    0.02335 445.668  < 2e-16 ***
## log(price)                       -2.77415    0.03883 -71.445  < 2e-16 ***
## brandminute.maid                  0.04720    0.04663   1.012    0.311    
## brandtropicana                    0.70794    0.05080  13.937  < 2e-16 ***
## feat                              1.09441    0.03810  28.721  < 2e-16 ***
## log(price):brandminute.maid       0.78293    0.06140  12.750  < 2e-16 ***
## log(price):brandtropicana         0.73579    0.05684  12.946  < 2e-16 ***
## log(price):feat                  -0.47055    0.07409  -6.351 2.17e-10 ***
## brandminute.maid:feat             1.17294    0.08196  14.312  < 2e-16 ***
## brandtropicana:feat               0.78525    0.09875   7.952 1.90e-15 ***
## log(price):brandminute.maid:feat -1.10922    0.12225  -9.074  < 2e-16 ***
## log(price):brandtropicana:feat   -0.98614    0.12411  -7.946 2.00e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4829706)
## 
##     Null deviance: 30079  on 28946  degrees of freedom
## Residual deviance: 13975  on 28935  degrees of freedom
## AIC: 61094
## 
## Number of Fisher Scoring iterations: 2
reg1 = lm(logmove ~ log(price)*brand*feat + AGE60 + EDUC + ETHNIC + INCOME+ HHLARGE + WORKWOM + HVAL150 + SSTRDIST
           + SSTRVOL + CPDIST5 + CPWVOL5, data=oj)
summary(reg1)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + AGE60 + EDUC + 
##     ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150 + SSTRDIST + 
##     SSTRVOL + CPDIST5 + CPWVOL5, data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2004 -0.3922 -0.0049  0.3786  2.9884 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      13.143435   0.329622  39.874  < 2e-16 ***
## log(price)                       -2.854302   0.036816 -77.528  < 2e-16 ***
## brandminute.maid                  0.092348   0.044158   2.091  0.03651 *  
## brandtropicana                    0.772280   0.048102  16.055  < 2e-16 ***
## feat                              1.055010   0.036034  29.278  < 2e-16 ***
## AGE60                             2.044239   0.127506  16.032  < 2e-16 ***
## EDUC                              0.955931   0.101486   9.419  < 2e-16 ***
## ETHNIC                            0.643102   0.037229  17.274  < 2e-16 ***
## INCOME                           -0.279937   0.033300  -8.406  < 2e-16 ***
## HHLARGE                          -0.920092   0.230509  -3.992 6.58e-05 ***
## WORKWOM                          -0.424559   0.146824  -2.892  0.00384 ** 
## HVAL150                           0.357561   0.041803   8.554  < 2e-16 ***
## SSTRDIST                         -0.019979   0.001465 -13.634  < 2e-16 ***
## SSTRVOL                          -0.055948   0.009768  -5.728 1.03e-08 ***
## CPDIST5                           0.064160   0.006289  10.203  < 2e-16 ***
## CPWVOL5                          -0.500272   0.025703 -19.464  < 2e-16 ***
## log(price):brandminute.maid       0.753984   0.058086  12.981  < 2e-16 ***
## log(price):brandtropicana         0.713983   0.053736  13.287  < 2e-16 ***
## log(price):feat                  -0.405049   0.070053  -5.782 7.45e-09 ***
## brandminute.maid:feat             1.131698   0.077495  14.603  < 2e-16 ***
## brandtropicana:feat               0.746656   0.093353   7.998 1.31e-15 ***
## log(price):brandminute.maid:feat -1.079118   0.115559  -9.338  < 2e-16 ***
## log(price):brandtropicana:feat   -0.989109   0.117302  -8.432  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6568 on 28924 degrees of freedom
## Multiple R-squared:  0.5851, Adjusted R-squared:  0.5848 
## F-statistic:  1854 on 22 and 28924 DF,  p-value: < 2.2e-16
reg2 = lm(logmove ~ log(price)*brand*feat + log(price)*HHLARGE + feat*HHLARGE + log(price)*EDUC + log(price)*INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150 + SSTRDIST
          + SSTRVOL + CPDIST5 + CPWVOL5, data=oj)
summary(reg2)
## 
## Call:
## lm(formula = logmove ~ log(price) * brand * feat + log(price) * 
##     HHLARGE + feat * HHLARGE + log(price) * EDUC + log(price) * 
##     INCOME + AGE60 + EDUC + ETHNIC + HHLARGE + WORKWOM + HVAL150 + 
##     SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5, data = oj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5575 -0.3758 -0.0060  0.3632  2.9126 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      17.743378   0.588134  30.169  < 2e-16 ***
## log(price)                       -8.836571   0.611508 -14.450  < 2e-16 ***
## brandminute.maid                  0.096972   0.043090   2.250   0.0244 *  
## brandtropicana                    0.805184   0.047040  17.117  < 2e-16 ***
## feat                              1.028299   0.050330  20.431  < 2e-16 ***
## HHLARGE                           4.243096   0.472819   8.974  < 2e-16 ***
## EDUC                             -0.997972   0.170358  -5.858 4.73e-09 ***
## INCOME                           -0.731403   0.058832 -12.432  < 2e-16 ***
## AGE60                             2.039271   0.124414  16.391  < 2e-16 ***
## ETHNIC                            0.665650   0.036344  18.315  < 2e-16 ***
## WORKWOM                          -0.346427   0.143286  -2.418   0.0156 *  
## HVAL150                           0.383774   0.040794   9.408  < 2e-16 ***
## SSTRDIST                         -0.019730   0.001430 -13.796  < 2e-16 ***
## SSTRVOL                          -0.051052   0.009531  -5.356 8.57e-08 ***
## CPDIST5                           0.060879   0.006137   9.919  < 2e-16 ***
## CPWVOL5                          -0.502568   0.025080 -20.038  < 2e-16 ***
## log(price):brandminute.maid       0.749394   0.056679  13.222  < 2e-16 ***
## log(price):brandtropicana         0.686462   0.052492  13.077  < 2e-16 ***
## log(price):feat                  -0.392707   0.068368  -5.744 9.34e-09 ***
## brandminute.maid:feat             1.123165   0.075615  14.854  < 2e-16 ***
## brandtropicana:feat               0.734101   0.091122   8.056 8.17e-16 ***
## log(price):HHLARGE               -6.543458   0.497739 -13.146  < 2e-16 ***
## feat:HHLARGE                      0.188415   0.307068   0.614   0.5395    
## log(price):EDUC                   2.375998   0.174291  13.632  < 2e-16 ***
## log(price):INCOME                 0.583554   0.061208   9.534  < 2e-16 ***
## log(price):brandminute.maid:feat -1.071241   0.112753  -9.501  < 2e-16 ***
## log(price):brandtropicana:feat   -0.988814   0.114465  -8.639  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6408 on 28920 degrees of freedom
## Multiple R-squared:  0.6051, Adjusted R-squared:  0.6048 
## F-statistic:  1705 on 26 and 28920 DF,  p-value: < 2.2e-16

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.