rm(list = ls())
# Set up environment and load datasets
setwd("~/Desktop/Courses/GBA424/Assignments/Project 3")
load("GBA424 - Toy Horse Case Data.rdata")
require("cluster")
## Loading required package: cluster
require("fpc")
## Loading required package: fpc
require("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
require("gridExtra")
## Loading required package: gridExtra
library(cluster)
library(fpc)
library(factoextra)
library(gridExtra)
library(reshape)
################################
# PART A #
################################
## Use regression to estimate the conjoint model at the individual level
# Create new dataset for part A
indi_data = conjointData
# Subset the data to train the model
indi_training = indi_data[!(is.na(indi_data$ratings)),]
indi_missing = indi_data[is.na(indi_data$ratings),]
# Store the coefficients
numIDs = length(unique(conjointData$ID)) # Number of respondents
partworths1 = data.frame(ID = 1:numIDs, intercept = NA, price = NA,
size = NA, motion = NA, style = NA)
indi_pred = list() # List that saves predicted ratings of missing profiles
for (num in 1:numIDs){
data.training.sub = subset(indi_training, ID == num)
data.missing.sub = subset(indi_missing, ID == num)
lm = lm(ratings~price+size+motion+style,data = data.training.sub)
partworths1[num, 2:6] = lm$coefficients
indi_pred = append(indi_pred,predict(lm,data.missing.sub))
}
# Replace NA ratings with predicted ratings for missing profiles
indi_data$ratings[is.na(indi_data$ratings)] = unlist(indi_pred)
Part-utilities of the conjoint model at the individual level are stored in the ‘partworths1’. The NAs in the survey data are replaced by the predictions for missing profiles.
################################
# PART B #
################################
source("ConjointCode.R")
## Evaluate number of clusters to use on data with visualizations
checkClust = clustTest(partworths1[,2:6],print=TRUE,scale=TRUE,maxClusts=10,
seed=12345,nstart=20,iter.max=100)
clusts = runClusts(partworths1[,2:6],c(2,3,4,5),print=TRUE,maxClusts=4,
seed=12345,nstart=20,iter.max=100)
The optimal number of clusters is 3, where the average silhouette width is the highest, and the customer can be separated in to 3 non-overlapped groups of people with different preferences.
## Plot clusters with nClusters = 3
plotClust(clusts[[1]][[2]],partworths1)
# Cluster means
partworths1_seg = as.data.frame(clusts[[1]][[2]]$centers)
partworths1_seg
## intercept price size motion style
## 1 31.50291 22.474947 5.348628 -9.434120 -6.6285255
## 2 40.88116 11.564861 -6.561319 10.038882 0.4726337
## 3 46.13706 8.938311 16.593109 6.965209 10.9539010
In the post-hoc segmentation, we use 3 clusters. The sign and magnitude of attribute coefficients indicate the preference of consumers within certain attributes, with positive sign meaning consumers prefer that attribute. We can use this result to support our product line decision.
Ideal product for each segment
Segment 1: prefer lower price, bigger size, bouncing motion and racing style. → Profile 4
Segment 2: preder lower price, smaller size, rocking motion and glamour style. → Profile 14
Segment 3: preder lower price, bigger size, rocking motion and glamour style. → Profile 16
################################
# PART C #
################################
# Create new dataset for part C
seg_data = conjointData
## Conduct a priori segmentation using the variabkes gender and age
demo = as.data.frame(lapply(respondentData,as.factor)) # demographic info
# Create 3 segmentations by age, by gender, and by age & gender
demo_seg1 = kmeans(x=demo[,2:3], centers = 4, nstart = 1000) # age & gender (4 clusters)
demo_seg2 = kmeans(x=demo[,2], centers = 2, nstart = 1000) # age (2 clusters)
demo_seg3 = kmeans(x=demo[,3], centers = 2, nstart = 1000) # gender (2 clusters)
# Merge cluster id with the original conjoint data
cluster_id = data.frame(ID = demo$ID,
seg1=factor(demo_seg1$cluster),
seg2=factor(demo_seg2$cluster),
seg3=factor(demo_seg3$cluster))
seg_data = merge(seg_data, cluster_id,by = "ID", all.x = T)
# Subset training and missing data
seg_training = seg_data[!(is.na(seg_data$ratings)),]
seg_missing = seg_data[is.na(seg_data$ratings),]
To test whether priori segmentations affect part-utilities, we run regressions with interactions of the segment dummies with each attribute.
Segmentation 1: By age and gender
# Segmentation 1 by age and gender
summary(lm(ratings~price+size+motion+style+
price*seg1+size*seg1+
motion*seg1+style*seg1,
data=seg_training))[[4]]
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.3828391 1.310593 30.8126462 9.855947e-176
## price 13.6446118 1.228081 11.1105104 5.374151e-28
## size 9.4914412 1.175798 8.0723367 1.083173e-15
## motion 2.0536536 1.175798 1.7466034 8.083515e-02
## style 3.8353065 1.175798 3.2618740 1.122419e-03
## seg12 -5.3905360 2.082565 -2.5884120 9.700640e-03
## seg13 -2.6048124 1.931992 -1.3482519 1.777057e-01
## seg14 1.1196003 1.986693 0.5635497 5.731137e-01
## price:seg12 5.1194739 1.951452 2.6234178 8.760826e-03
## price:seg13 1.7460060 1.810359 0.9644529 3.349169e-01
## price:seg14 -0.3126248 1.861616 -0.1679319 8.666511e-01
## size:seg12 -3.7054922 1.868373 -1.9832722 4.745179e-02
## size:seg13 -7.1289830 1.733287 -4.1129851 4.038368e-05
## size:seg14 -3.9889956 1.782362 -2.2380394 2.531065e-02
## motion:seg12 -6.0030956 1.868373 -3.2130070 1.331076e-03
## motion:seg13 -0.3604476 1.733287 -0.2079561 8.352810e-01
## motion:seg14 1.9603507 1.782362 1.0998613 2.715038e-01
## style:seg12 -6.9041865 1.868373 -3.6952935 2.245875e-04
## style:seg13 -4.8176418 1.733287 -2.7794833 5.487330e-03
## style:seg14 -0.2488841 1.782362 -0.1396373 8.889584e-01
The interaction coefficients between segmentations and attributes are not entirely significant, so we consider testing whether gender or age is meaningful for business segmentation.
Segmentation 2: By age
# Segmentation 2 by age
summary(lm(ratings~price+size+motion+style+
price*seg2+size*seg2+
motion*seg2+style*seg2,
data=seg_training))[[4]]
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.54618223 1.0867243 36.39026357 4.223074e-231
## price 14.41328802 1.0183069 14.15416914 9.781338e-44
## size 3.85315922 0.9749546 3.95214210 7.970172e-05
## motion 2.79499918 0.9749546 2.86679924 4.182755e-03
## style 1.18667087 0.9749546 1.21715497 2.236654e-01
## seg22 -1.29820883 1.5292330 -0.84892805 3.960063e-01
## price:seg22 1.25883815 1.4329565 0.87849016 3.797661e-01
## size:seg22 4.17076027 1.3719514 3.04002051 2.391276e-03
## motion:seg22 -3.11880912 1.3719514 -2.27326508 2.309859e-02
## style:seg22 -0.08569567 1.3719514 -0.06246261 9.501997e-01
The segmentation here only affects part-utilities of size attribute.
Segmentation 3: By gender
# Segmentation 3 by gender
summary(lm(ratings~price+size+motion+style+
price*seg3+size*seg3+
motion*seg3+style*seg3,
data=seg_training))[[4]]
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5668426 1.0739178 34.0499449 1.606196e-207
## price 16.8573429 1.0063067 16.7516948 1.215027e-59
## size 3.8509324 0.9634653 3.9969600 6.610460e-05
## motion -0.7601191 0.9634653 -0.7889429 4.302236e-01
## style -1.8895287 0.9634653 -1.9611797 4.997401e-02
## seg32 4.3032300 1.4614170 2.9445599 3.265352e-03
## price:seg32 -3.3487809 1.3694100 -2.4454188 1.454011e-02
## size:seg32 3.9045569 1.3111102 2.9780539 2.930023e-03
## motion:seg32 3.6668883 1.3111102 2.7967811 5.202755e-03
## style:seg32 5.6165245 1.3111102 4.2837927 1.910028e-05
The interation coefficients are significant in segmentations by gender.
Conclusion
From the significant effect of gender segmentation to all the four attributes, we can safely conclude that gender is the most meaningful factor to use for a priori segmentation. Meanwhile, age does not play such an important role as gender with insignificant effects to price and style.
We will use only gender to do priori demographic segmentation.
## Segment-level regressions
partworths2_seg = data.frame(cluster = 1:2, intercept = NA, price = NA,
size = NA, motion = NA, style = NA)
for (seg in 1:2){
data.sub = subset(seg_training, seg3 == seg)
lm = lm(ratings~price+size+motion+style, data=data.sub)
partworths2_seg[seg, 2:6] = lm$coefficients
}
partworths2_seg
## cluster intercept price size motion style
## 1 1 36.56684 16.85734 3.850932 -0.7601191 -1.889529
## 2 2 40.87007 13.50856 7.755489 2.9067692 3.726996
We’ll only get 2 sets of part-utilities instead of 200. But at least one set of part-utilities for attributes varies significantly across segments, and can be used for target different optimal products.
Ideal product for each segment
Segment 1: prefer lower price, bigger size, bouncing motion and racing style. → Profile 4
Segment 2: preder lower price, bigger size, rocking motion and glamour style. → Profile 16
#################################
# PART D #
#################################
# Prepare data for analysis
ratingData = cast(indi_data, ID ~ profile, value="ratings")
ratingData = ratingData[, -1] # Remove the ID column
# Function to calculate market share and deal with tie decisions
simFCSharesTie = function(scen,data,ascend=FALSE){
inmkt = data[,scen]
if(ascend){
bestOpts = apply(inmkt,1,min)
} else {
bestOpts = apply(inmkt,1,max)
}
decisions = inmkt == bestOpts
decisionsTie = decisions / rowSums(decisions)
mkShare = colSums(decisionsTie)/sum(decisionsTie)
mkShare
}
Set up scenarios
Our current products’ profile IDs are 5 and 13, and the competitor’s profile ID is 7. We will simulate the scenarios in which we launch ideal products from part B and part C, considering the competitor’s reponse by reducing his price.
Senarios:
Scenario | Our Products | Competitor’s Product |
---|---|---|
1 (Original) | 5, 13 | 7 |
2 (Part B) | 4, 14, 16 | 7 |
3 (Part B) | 4, 14, 16 | 8 |
4 (Part C) | 4, 16 | 7 |
5 (Part C) | 4, 16 | 8 |
6 | 14, 16 | 7 |
7 | 14, 16 | 8 |
## Set up scenarios
scens = list()
scens[[1]]=c(5,13,7)
scens[[2]]=c(4,14,16,7)
scens[[3]]=c(4,14,16,8)
scens[[4]]=c(4,16,7)
scens[[5]]=c(4,16,8)
scens[[6]]=c(14,16,7)
scens[[7]]=c(14,16,8)
## Market Share
sapply(scens,simFCSharesTie,data=ratingData, ascend=FALSE)
## [[1]]
## 5 13 7
## 0.22 0.21 0.57
##
## [[2]]
## 4 14 16 7
## 0.40 0.25 0.35 0.00
##
## [[3]]
## 4 14 16 8
## 0.355 0.220 0.340 0.085
##
## [[4]]
## 4 16 7
## 0.405 0.595 0.000
##
## [[5]]
## 4 16 8
## 0.355 0.465 0.180
##
## [[6]]
## 14 16 7
## 0.300 0.695 0.005
##
## [[7]]
## 14 16 8
## 0.230 0.365 0.405
In scenario 2, 4, 6, the competitor’s share decreases tremendously so we assume that he will decrease his price in response (i.e., changing from profile 7 to profile 8). Hence we remove these scenarios and move forward with scenario 1, 3, 5, 7. After simulating the market share, we will simulate short-term and long-term profitability.
## Simulate profitability
# Variable cost
variableCost = profilesData
variableCost$varCost[variableCost$size==0 & variableCost$motion==1] = 33 # 18" Rocking
variableCost$varCost[variableCost$size==1 & variableCost$motion==1] = 41 # 26" Rocking
variableCost$varCost[variableCost$size==0 & variableCost$motion==0] = 21 # 18" Bouncing
variableCost$varCost[variableCost$size==1 & variableCost$motion==0] = 29 # 26" Bouncing
# Function to calculate profitability over years
profitFunc = function(scen, data, year=1) {
marketShares = simFCSharesTie(scen, data, ascend=FALSE)
ourProducts = scen[-length(scen)] # exclude competitor's share
ourMarketShare = marketShares[1:length(ourProducts)]
quantity = ourMarketShare*4000
price = profilesData$priceLabel[profilesData$profile %in% ourProducts]*100/125
varCost = variableCost$varCost[variableCost$profile %in% ourProducts]
fixCost = 20000*length(ourProducts)*year +
sum(!(ourProducts %in% c(5, 13, 6, 14)))*1/3*20000
margin = (price-varCost)*quantity
profit = sum(margin)*year - fixCost
results = list(profit, margin)
results
}
First we calculate annual margin for each product in each scenario.
# Annual margin for each product
productMargin = lapply(scens[c(1, 3, 5, 7)],
function (x) profitFunc(x,
data=ratingData,
year=1)[[2]])
productMargin
## [[1]]
## 5 13
## 69512.96 66353.28
##
## [[2]]
## 4 14 16
## 95128.64 55432.96 74789.12
##
## [[3]]
## 4 16
## 95128.64 102285.12
##
## [[4]]
## 14 16
## 57952.64 80288.32
Then we look at overall profitability of the company over years.
# Calculate overall profit
profitData = matrix(nrow=10, ncol=4)
colnames(profitData) = c("'5,13,7'", "'4,14,16,8'", "'4,16,8'", "'14,16,8'")
rownames(profitData) = paste("Year", 1:10)
for (year in 1:10) {
profitData[year, ] = sapply(scens[c(1, 3, 5, 7)],
function (x) profitFunc(x,
data=ratingData,
year=year)[[1]])
}
profitData
## '5,13,7' '4,14,16,8' '4,16,8' '14,16,8'
## Year 1 95866.24 152017.4 144080.4 91574.29
## Year 2 191732.48 317368.1 301494.2 189815.25
## Year 3 287598.72 482718.8 458907.9 288056.21
## Year 4 383464.96 648069.5 616321.7 386297.17
## Year 5 479331.20 813420.3 773735.5 484538.13
## Year 6 575197.44 978771.0 931149.2 582779.09
## Year 7 671063.68 1144121.7 1088563.0 681020.05
## Year 8 766929.92 1309472.4 1245976.7 779261.01
## Year 9 862796.16 1474823.1 1403390.5 877501.97
## Year 10 958662.40 1640173.9 1560804.3 975742.93
Scenario 3 (2nd column), in which we sell profile 4, 14, 16 and the competitor sell profile 8, yields the highest profit both in short term and long term.
apply(profitData, 1, which.max)
## Year 1 Year 2 Year 3 Year 4 Year 5 Year 6 Year 7 Year 8 Year 9
## 2 2 2 2 2 2 2 2 2
## Year 10
## 2