R Markdown

This is an R Markdown document. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document.

The chunk-opening syntax on line 19 and the subsequent chunk-closing syntax on line 56 (typed without the asterisks) tell R Markdown to run the R code between them and to embed that code, together with its output, in the generated document.

library(plyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the online-retail transactions.
# The data directory is held in a variable and combined with file.path()
# rather than calling setwd(), so the session's working directory is not
# changed as a side effect of reading one file.
data_dir <- "C:/Users/jlariv/OneDrive/Econ 404/Exams"
mydata <- read.csv(file.path(data_dir, "online_retail.csv"))
colnames(mydata)
## [1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
## [6] "UnitPrice"   "CustomerID"  "Country"
# Revenue of each row: units sold times the price per unit.
mydata$rev <- mydata$Quantity * mydata$UnitPrice

# Price statistics per StockCode: mean and standard deviation of UnitPrice,
# plus the number of rows observed for that code.
summary <- ddply(
  .data = mydata,
  .variables = .(StockCode),
  .fun = summarize,
  mean_price = mean(UnitPrice),
  sd_price = sd(UnitPrice),
  obs = length(UnitPrice)
)
## Warning: package 'bindrcpp' was built under R version 3.3.3
# Put the stock codes with the most observations first.
summary <- summary[order(-summary[["obs"]]), ]
# There looks to be a good amount of price variation for the most sold products

# One row per StockCode: total revenue, mean unit price, mean quantity per
# row, and total quantity. Rows are sorted by revenue (largest first) and
# then restricted to products whose revenue, mean price and total quantity
# are all strictly positive.
product_view <- mydata %>%
  group_by(StockCode) %>%
  summarise(
    rev = sum(rev),
    UnitPrice = mean(UnitPrice),
    AveSale = mean(Quantity),
    TotalQ = sum(Quantity)
  ) %>%
  arrange(desc(rev)) %>%
  filter(rev > 0, UnitPrice > 0, TotalQ > 0)

# Total quantity against revenue, one point per product, coloured by revenue.
ggplot(
  data = product_view,
  mapping = aes(x = rev, y = TotalQ, colour = rev)
) +
  geom_point(alpha = 0.4, size = 5, shape = 16, show.legend = FALSE) +
  scale_colour_gradient(low = "#0091ff", high = "#f0650e") +
  theme_minimal()

# Mean unit price against total quantity, again coloured by revenue.
ggplot(
  data = product_view,
  mapping = aes(x = TotalQ, y = UnitPrice, colour = rev)
) +
  geom_point(alpha = 0.4, size = 5, shape = 16, show.legend = FALSE) +
  scale_colour_gradient(low = "#0091ff", high = "#f0650e") +
  theme_minimal()

# Keep the products whose total revenue exceeds 75000, then join those
# product-level totals back onto the individual rows of mydata. Columns
# present in both tables receive merge()'s default .x / .y suffixes.
Money_makers <- filter(product_view, rev > 75000)
df <- merge(x = mydata, y = Money_makers, by = "StockCode")

# Which descriptions appear among the top-revenue stock codes?
unique(df[["Description"]])
##  [1] REGENCY CAKESTAND 3 TIER           damages                           
##  [3] faulty                             PARTY BUNTING                     
##  [5] JUMBO BAG RED RETROSPOT            WHITE HANGING HEART T-LIGHT HOLDER
##  [7] CREAM HANGING HEART T-LIGHT HOLDER wrongly marked carton 22804       
##  [9] ?                                  DOTCOM POSTAGE                    
## [11]                                   
## 4224 Levels:  ... ZINC WIRE SWEETHEART LETTER TRAY
# Looking at the data, DOT seems to have multiple subscriptions, but it looks
# like it makes this company a bunch of money!
# Let's drop DOT and focus on the real products.

# Remove every row whose StockCode is "DOT" and look at the descriptions
# that remain.
df1 <- df[which(df[["StockCode"]] != "DOT"), ]
unique(df1[["Description"]])
## [1] REGENCY CAKESTAND 3 TIER           damages                           
## [3] faulty                             PARTY BUNTING                     
## [5] JUMBO BAG RED RETROSPOT            WHITE HANGING HEART T-LIGHT HOLDER
## [7] CREAM HANGING HEART T-LIGHT HOLDER wrongly marked carton 22804       
## [9] ?                                 
## 4224 Levels:  ... ZINC WIRE SWEETHEART LETTER TRAY
# Still looks like there are a bunch of strange Descriptions from the same
# StockCode, so sort the rows by Description to group them together.
df1 <- df1[order(df1[["Description"]]), ]
# There is just a bunch of messy data. This is normal and it's life. One
# suggestion is just to look at the data which looks "right" by examining a
# subset of the data with the most common Description.

# Rows for the "JUMBO BAG RED RETROSPOT" description only, restricted to a
# strictly positive row-level price (UnitPrice.x) and quantity.
JUMBO <- df1[
  which(
    df1[["Description"]] == "JUMBO BAG RED RETROSPOT" &
      df1[["UnitPrice.x"]] > 0 &
      df1[["Quantity"]] > 0
  ),
]

# Row-level price against quantity for that product, coloured by price.
ggplot(
  data = JUMBO,
  mapping = aes(x = Quantity, y = UnitPrice.x, colour = UnitPrice.x)
) +
  geom_point(alpha = 0.4, size = 5, shape = 16, show.legend = FALSE) +
  scale_colour_gradient(low = "#0091ff", high = "#f0650e") +
  theme_minimal()

# This will be important if you can figure out the
# http://stat.ethz.ch/R-manual/R-devel/library/base/html/as.POSIXlt.html bonus.
# Store InvoiceDate as a plain character column named timestamp.
df1[["timestamp"]] <- as.character(df1[["InvoiceDate"]])

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.