# Set up environments
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(formattable)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(stringr)
dir = "~/Desktop/Courses/GBA424/Assignments/Project 2"
setwd(dir)
################################
# SQL CODE #
################################
# SQL command to extract `storeItemSales.csv` and `itemAttributes.csv`
# select * from storeItemSales
# select * from itemAttributes
# Load datasets
# For question 1, 2
itemsAttributes = read.csv('itemsAttributes.csv')
storeItemSales = read.csv('storeItemSales.csv')
# For question 3, 4
survResponses = read.csv('survResponses.csv')
survQuestions = read.csv('survQuestions.csv')
We chose sales data from all the stores because it is the most representative data for the whole population.
# Merge sales data table with item attributes data table
salesData = merge(itemsAttributes, storeItemSales, right_on='Item.Num')
For Greek yogurt, we only include six flavors mentioned in the case as currently existing flavors.
################################
# OUTPUT FOR SLIDE 2 #
################################
existingFlavor=c('plain','strawberry','blueberry','honey','vanilla','peach')
# calculate the percentage of sales
greekYogurt=
salesData %>%
filter(Class=='GREEK', Flavor1 %in% existingFlavor) %>% # Where
group_by(Flavor1) %>% # Group by
summarize(total_sales=sum(Sales)) %>% # Aggregation
mutate(Percentage=percent(total_sales/sum(total_sales))) %>% # Calculate percentage in a new column
arrange(desc(Percentage)) # Order by
# PLOT PIE CHART
library(ggplot2)
# Create a bar plot
bp<- ggplot(greekYogurt, aes(x="", y=Percentage, fill=Flavor1))+geom_bar(width = 1, stat = "identity")
# Create a pie chart with title
pie <- bp + coord_polar("y", start=0)+ggtitle('Greek Yogurt Flavors')
# Define the theme with blank grids
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
# Add labels to the pie chart
pie <- pie + scale_fill_brewer(palette="Paired")+theme_minimal()+blank_theme+theme(axis.text = element_blank(),axis.ticks = element_blank(),panel.grid = element_blank())+geom_text(aes(label=Percentage),position = position_stack(vjust = 0.5))
pie
For regular yogurt, we included all the flavors. We showed the percentage data of the top 10 popular flavors and use “Others” to represent all the other flavors.
################################
# OUTPUT FOR SLIDE 3 #
################################
# Calculate the percentage of sales
regular =
salesData %>%
filter(Class=='REGULAR') %>% # where
group_by(Flavor1) %>% # group by
summarize(total_sales=sum(Sales)) %>% #aggregation
mutate(Percentage=percent(total_sales/sum(total_sales))) %>% #create a new column
arrange(desc(Percentage)) # order by
# Change the category of flavors outside the top10 popular flavors as "Others"
levels(regular$Flavor1) <- c(levels(regular$Flavor1), "Others")
regular[11:82,1] = "Others"
# Calculate the percentage of sales
Regular_Yogurt =
regular %>%
group_by(Flavor1) %>%# group by %>% #aggregation
summarize(Percentage=sum(Percentage)) %>%
arrange(desc(Percentage))
# PLOT PIE CHART
# create a bar plot
bp2<- ggplot(Regular_Yogurt, aes(x="", y=Percentage, fill=Flavor1))+geom_bar(width = 1, stat = "identity")
# create a pie chart with title
pie2 <- bp2 + coord_polar("y", start=0)+ggtitle('Regular Yogurt Flavors')
# add labels to the pie chart
pie2 <- pie2 + scale_fill_brewer(palette="Paired")+theme_minimal()+blank_theme+theme(axis.text = element_blank(),axis.ticks = element_blank(),panel.grid = element_blank())+geom_text(aes(label=Percentage),position = position_stack(vjust = 0.5))
pie2
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (position_stack).
Before conducting analysis, first we needed to do some data cleaning and processing.
# Change column names of `survResponses` in `Question 12` to flavors' name
# Extract flavor names from `survQuestions`. Before running this line, we identify an error in survQuestions.csv's Q3_8 column and fix it with Excel.
flavorNames = sapply(survQuestions[1, ], function (x) {substring(x, 105, nchar(as.character(x), type="chars") - 6)})
# Change column names of `survResponses`
names(survResponses)[c(15:37)] <- flavorNames[c(15:37)]
We removed unreliable survey samples which: - are incomplete (V10 == 0), - skipped Question 12 entirely, and - take longer than 30 minutes to complete.
# Remove incomplete answers
sum(survResponses$V10 == 0) # 129 incomplete answers
## [1] 129
survResponses <- survResponses[survResponses$V10 != 0, ]
# Remove samples which skipped Q12 entirely
emptyQ12 <- apply(survResponses[, c(15:37)], 1, function(x) {all(is.na(x))})
sum(emptyQ12) # 19 people skipped Q12 entirely
## [1] 19
survResponses <- survResponses[!emptyQ12, ]
# Calculate time taken to answer survey
survResponses$V8 <- ymd_hms(survResponses$V8)
survResponses$V9 <- ymd_hms(survResponses$V9)
survResponses$timeElapsed <- as.numeric(difftime(survResponses$V9, survResponses$V8, units="mins"))
# Remove answers took more than 30 minutes to complete
longAnswer <- survResponses$timeElapsed > 30
sum(longAnswer) # 25 unreliable answers
## [1] 25
survResponses <- survResponses[!longAnswer, ]
# After remove unreliable responses, there are 579 samples left.
dim(survResponses)
## [1] 579 38
After removing unreliable samples, we assumed that NA values in Question 12 columns mean Never
and replaced these NA values with 2
.
# Replace missing values with value `2`
flavors <- survResponses[, c(15: 37)]
flavors[is.na(flavors)] <- 2
After data cleaning, we exported our data and use Tableau to create visualization for the our 4th slides. The code below will create similar bar charts.
################################
# OUTPUT FOR SLIDE 4 #
################################
# Export data for visualization in Tableau
flavorData <- ifelse(flavors==0, "Regular", ifelse(flavors==2, "Never", "Occasionally"))
write.csv(flavorData,'flavorData')
# Create bar chart to understand preference
par(mfrow=c(3, 3));
for (i in 1:length(colnames(flavorData))) {
x = flavorData[,i]
plot(factor(x),main=colnames(flavorData)[i])
}
Before conducting TURF analysis, we will encode the data as following: - Regularly: 4 - Occasionally: 1 - Never: 0
Originally, “Regularly” is encoded as 0, “Occasionally” as 1, and “Never” as 2.
# Encode data
flavorPurchase <- ifelse(flavors==0, 4, ifelse(flavors==2, 0, 1))
The code below will be used to do TURF Analysis. The codes are taken from WegmansSurveyCase.Rmd
file provided by Prof. Lovett. We created a new function measFreq
to measure total frequency.
#measReach: measures reach given set of options and data
measReach = function(data){ #measures reach given expected data
if(is.null(dim(data))){ #if data is a vector
ret = sum(data>1,na.rm=TRUE)/length(data)
} else if(ncol(data)==1){ #if data has only one column
ret = sum(data>1,na.rm=TRUE)/length(data)
}
else { #if data has multiple columns
ret = sum(apply(data>1,1,any),na.rm=TRUE)/nrow(data) #any: is at least one of the values true?
}
}
#measFreq: measures frequency given set of options and data
measFreq = function(data){
if(is.null(dim(data))){ #if data is a vector
ret = sum(data,na.rm=TRUE)
} else if(ncol(data)==1){ #if data has only one column
ret = sum(data,na.rm=TRUE)
}
else { #if data has multiple columns
ret = sum(apply(data>1,1,sum),na.rm=TRUE)
}
ret
}
#evalNext: evaluates the next set, nextSet using measure given existing set in data
evalNext = function(nextSet,set,data,measure=measReach){ #evaluates each option in a set for total reach
vals = numeric(length(nextSet)) #set up storage for return value
for(k in 1:length(nextSet)){ #loop over the options in nextSet
if(length(set)==0){ #if no existing options
vals[k] = measure(data[,nextSet[k]])
} else { #if existing options
vals[k] = measure(data[,c(set,nextSet[k])])
}
}
vals
}
#evalFull: creates optimal full evaluation starting from origSet and considering remaining options fullSet
evalFull = function(fullSet,data,origSet=numeric(0),measure=measReach){
#evaluates full set of cases picking optimal at each stage, returns a TURF object
curSet = origSet; #the current set of included options
remSet = fullSet[!(fullSet%in%origSet)]; #the remaining set of options to consider
K = length(remSet)
optVals = numeric(K); #create storage for the optimal values (optVals)
ordSet = numeric(K); #create storage for ordered set
for(i in 1:K){ #loop over the remaining set consider
tmpVals = evalNext(remSet,curSet,data,measure); #calculate vector of next evaluations
k = which.max(tmpVals) #pick the option that gives max measure, note will pick first case if a tie!
optVals[i] = tmpVals[k] #add optimal value
ordSet[i] = remSet[k] #add index of option that creates optimal value
curSet = c(curSet,ordSet[i]); #add optimal next option to current set
remSet = remSet[-k]; #delete optimal next option from remaining set
}
#creaets a "TURF object" containing ordSet, optVals, origSet, origVal, measure, and pnames
turf = list(ordSet=ordSet,optVals=optVals,origSet=origSet,origVal=measure(data[,origSet]),measure=measure,pnames=colnames(data))
class(turf)="TURF" #makes the list into a TURF object so that can call plot.TURF
turf #return turf
}
#creates ggplot barplot for a turf object
plot.TURF=function(turf,...){
if(class(turf)!="TURF"){
cat("Object not a turf.") #concatenate and print
} else {
df = with(turf,data.frame(vals = c(origVal,optVals),
titles=paste(0:length(ordSet),
c("Original",pnames[ordSet]),sep=":")))
#with(turf,barplot(c(origVal,optVals),names.arg=c("Original",pnames[ordSet])))
dodge = position_dodge(width=.75); ##to form constant dimensions positioning for all geom's
df$titles <- factor(df$titles, levels=df$titles)
gp = ggplot(df,aes(y=vals,x=titles))
gp + geom_bar(position=dodge,stat="identity",col=1,fill="lightblue",width=.75) + labs(title="Turf Analysis", x="Flavors") + scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
}
}
We will measure Reach and Frequency with and without the original set.
################################
# OUTPUT FOR SLIDE 5 #
################################
originSet <- which(colnames(flavorPurchase) %in% c("Blueberry", "Honey", "Peach",
"Plain", "Strawberry", "Vanilla"))
fullSet <- 1:length(colnames(flavorPurchase))
# Reach with no original set
turf.1 = evalFull(fullSet, flavorPurchase, measure=measReach)
plot.TURF(turf.1)
# Reach with the original set
turf.2 = evalFull(fullSet, flavorPurchase, originSet, measure=measReach)
plot.TURF(turf.2)
# Frequency with no original set
turf.3 = evalFull(fullSet, flavorPurchase, measure=measFreq)
plot.TURF(turf.3)
# Frequency with the original set
turf.4 = evalFull(fullSet, flavorPurchase, originSet, measure=measFreq)
plot.TURF(turf.4)
19 3 13 18 16 / 16 12 3 1 2