A supermarket is offering a new line of organic products. The supermarket’s management wants to determine which customers are likely to purchase these products. The supermarket has a loyalty program (think Kroger Card). As an initial buyer incentive plan, the supermarket provided coupons for the organic products to all of the loyalty program participants and collected data that includes whether these customers purchased any of the organics products.
The target variable is TargetBuy which indicates if an organics purchase was made or not.
data$ID<-as.factor(data$ID)
data<-dummy_columns(data, select_columns = c("DemClusterGroup","DemGender", "DemReg", "DemTVReg", "PromClass"), remove_first_dummy = TRUE, remove_selected_columns = TRUE)
data.summary(data)
## mean sd min max median length
## DemAffl 8.774307e+00 3.4787618 0.00 34.00 8 3000
## DemAge 5.346187e+01 13.3625093 18.00 79.00 53 3000
## PromSpend 4.444894e+03 7176.3483976 0.01 81551.03 2000 3000
## PromTime 6.565980e+00 4.6870982 0.00 39.00 5 3000
## TargetBuy 2.556667e-01 0.4363080 0.00 1.00 0 3000
## TargetAmt 3.080000e-01 0.5790058 0.00 3.00 0 3000
## DemClusterGroup_B 1.813333e-01 0.3853583 0.00 1.00 0 3000
## DemClusterGroup_C 2.043333e-01 0.4032808 0.00 1.00 0 3000
## DemClusterGroup_D 1.883333e-01 0.3910433 0.00 1.00 0 3000
## DemClusterGroup_E 1.266667e-01 0.3326546 0.00 1.00 0 3000
## DemClusterGroup_F 1.790000e-01 0.3834162 0.00 1.00 0 3000
## DemClusterGroup_U 3.500000e-02 0.1838104 0.00 1.00 0 3000
## DemGender_M 2.670000e-01 0.4424661 0.00 1.00 0 3000
## DemGender_U 1.803333e-01 0.3845289 0.00 1.00 0 3000
## DemReg_North 1.980000e-01 0.3985586 0.00 1.00 0 3000
## DemReg_Scottish 6.400000e-02 0.2447937 0.00 1.00 0 3000
## DemReg_South East 3.910000e-01 0.4880557 0.00 1.00 0 3000
## DemReg_South West 2.866667e-02 0.1668957 0.00 1.00 0 3000
## DemReg_U 1.700000e-02 0.1292926 0.00 1.00 0 3000
## DemTVReg_C Scotland 3.800000e-02 0.1912281 0.00 1.00 0 3000
## DemTVReg_East 7.766667e-02 0.2676909 0.00 1.00 0 3000
## DemTVReg_London 2.846667e-01 0.4513308 0.00 1.00 0 3000
## DemTVReg_Midlands 1.316667e-01 0.3381844 0.00 1.00 0 3000
## DemTVReg_N East 3.933333e-02 0.1944192 0.00 1.00 0 3000
## DemTVReg_N Scot 1.933333e-02 0.1377167 0.00 1.00 0 3000
## DemTVReg_N West 9.233333e-02 0.2895442 0.00 1.00 0 3000
## DemTVReg_S & S East 1.063333e-01 0.3083152 0.00 1.00 0 3000
## DemTVReg_S West 2.866667e-02 0.1668957 0.00 1.00 0 3000
## DemTVReg_U 1.700000e-02 0.1292926 0.00 1.00 0 3000
## DemTVReg_Ulster 1.433333e-02 0.1188806 0.00 1.00 0 3000
## DemTVReg_Wales & West 7.766667e-02 0.2676909 0.00 1.00 0 3000
## DemTVReg_Yorkshire 6.633333e-02 0.2489054 0.00 1.00 0 3000
## PromClass_Platinum 3.766667e-02 0.1904205 0.00 1.00 0 3000
## PromClass_Silver 3.836667e-01 0.4863593 0.00 1.00 0 3000
## PromClass_Tin 2.976667e-01 0.4573084 0.00 1.00 0 3000
## missing
## DemAffl 151
## DemAge 194
## PromSpend 0
## PromTime 37
## TargetBuy 0
## TargetAmt 0
## DemClusterGroup_B 0
## DemClusterGroup_C 0
## DemClusterGroup_D 0
## DemClusterGroup_E 0
## DemClusterGroup_F 0
## DemClusterGroup_U 0
## DemGender_M 0
## DemGender_U 0
## DemReg_North 0
## DemReg_Scottish 0
## DemReg_South East 0
## DemReg_South West 0
## DemReg_U 0
## DemTVReg_C Scotland 0
## DemTVReg_East 0
## DemTVReg_London 0
## DemTVReg_Midlands 0
## DemTVReg_N East 0
## DemTVReg_N Scot 0
## DemTVReg_N West 0
## DemTVReg_S & S East 0
## DemTVReg_S West 0
## DemTVReg_U 0
## DemTVReg_Ulster 0
## DemTVReg_Wales & West 0
## DemTVReg_Yorkshire 0
## PromClass_Platinum 0
## PromClass_Silver 0
## PromClass_Tin 0
## levels mode missing
## ID 3000 140 0
data$TargetBuy<-as.factor(data$TargetBuy)
data$TargetBuy<-recode_factor(data$TargetBuy, "1"="Yes", "0"="No")
prop.table(table(data$TargetBuy))
##
## Yes No
## 0.2556667 0.7443333
set.seed(13)
trainIndex<-createDataPartition(data$TargetBuy, p=0.8, list=FALSE, times=1)
training<-data[trainIndex,]
valid<-data[-trainIndex,]
ctrl<-trainControl(method="none", summaryFunction = twoClassSummary, classProbs = TRUE, savePredictions = TRUE)
lrfull<-train(TargetBuy~., data=training, method="glm", family="binomial", metric="ROC", trControl=ctrl)
lrstep<-train(TargetBuy~., data=training, method="glmStepAIC", direction="both", metric="ROC", trControl=ctrl, trace=0)
options(scipen = 999)
summary(lrstep)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2318 -0.3340 0.4029 0.6769 2.4404
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.630534 0.321636 -1.960 0.04995
## DemAffl -0.247788 0.018036 -13.739 < 0.0000000000000002
## DemAge 0.061123 0.004587 13.326 < 0.0000000000000002
## DemClusterGroup_D -0.259007 0.141890 -1.825 0.06794
## DemClusterGroup_F -0.221322 0.144724 -1.529 0.12620
## DemGender_M 1.039327 0.135854 7.650 0.00000000000002
## DemGender_U 1.715473 0.192255 8.923 < 0.0000000000000002
## DemReg_North 0.572612 0.196383 2.916 0.00355
## DemReg_Scottish 1.137944 0.422507 2.693 0.00707
## `\\`DemReg_South East\\`` 0.436011 0.174594 2.497 0.01251
## `\\`DemTVReg_C Scotland\\`` -0.780339 0.488511 -1.597 0.11018
## DemTVReg_East 0.664367 0.256088 2.594 0.00948
## DemTVReg_Midlands 0.635626 0.214439 2.964 0.00304
## DemTVReg_Ulster 0.696712 0.491658 1.417 0.15646
##
## (Intercept) *
## DemAffl ***
## DemAge ***
## DemClusterGroup_D .
## DemClusterGroup_F
## DemGender_M ***
## DemGender_U ***
## DemReg_North **
## DemReg_Scottish **
## `\\`DemReg_South East\\`` *
## `\\`DemTVReg_C Scotland\\``
## DemTVReg_East **
## DemTVReg_Midlands **
## DemTVReg_Ulster
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2730.1 on 2400 degrees of freedom
## Residual deviance: 2068.3 on 2387 degrees of freedom
## AIC: 2096.3
##
## Number of Fisher Scoring iterations: 5
class.tree.ou<-rpart(TargetBuy~., data=training, control=rpart.control(maxdepth = 30, minsplit=20, cp=0.001, xval=10), method="class", model=TRUE)
printcp(class.tree.ou)
##
## Classification tree:
## rpart(formula = TargetBuy ~ ., data = training, method = "class",
## model = TRUE, control = rpart.control(maxdepth = 30, minsplit = 20,
## cp = 0.001, xval = 10))
##
## Variables actually used in tree construction:
## [1] DemAffl DemAge DemClusterGroup_B
## [4] DemClusterGroup_C DemGender_M DemGender_U
## [7] DemTVReg_London DemTVReg_Midlands DemTVReg_Wales & West
## [10] PromSpend PromTime
##
## Root node error: 614/2401 = 0.25573
##
## n= 2401
##
## CP nsplit rel error xerror xstd
## 1 0.0952769 0 1.00000 1.00000 0.034816
## 2 0.0211726 2 0.80945 0.81107 0.032357
## 3 0.0203583 4 0.76710 0.77850 0.031867
## 4 0.0130293 7 0.68893 0.74430 0.031329
## 5 0.0048860 8 0.67590 0.69544 0.030516
## 6 0.0043431 9 0.67101 0.71173 0.030793
## 7 0.0032573 12 0.65798 0.72150 0.030956
## 8 0.0024430 19 0.63518 0.72638 0.031037
## 9 0.0016287 23 0.62541 0.73290 0.031144
## 10 0.0012215 25 0.62215 0.74104 0.031276
## 11 0.0010858 34 0.60912 0.77850 0.031867
## 12 0.0010000 37 0.60586 0.77850 0.031867
prp(class.tree.ou, type=1, extra=1, split.font = 1, varlen = -10, digits = -3)
class.tree.mu<-prune(class.tree.ou, cp=class.tree.ou$cptable[which.min(class.tree.ou$cptable[,"xerror"]), "CP"])
printcp(class.tree.mu)
##
## Classification tree:
## rpart(formula = TargetBuy ~ ., data = training, method = "class",
## model = TRUE, control = rpart.control(maxdepth = 30, minsplit = 20,
## cp = 0.001, xval = 10))
##
## Variables actually used in tree construction:
## [1] DemAffl DemAge DemClusterGroup_B DemGender_M
## [5] DemGender_U
##
## Root node error: 614/2401 = 0.25573
##
## n= 2401
##
## CP nsplit rel error xerror xstd
## 1 0.095277 0 1.00000 1.00000 0.034816
## 2 0.021173 2 0.80945 0.81107 0.032357
## 3 0.020358 4 0.76710 0.77850 0.031867
## 4 0.013029 7 0.68893 0.74430 0.031329
## 5 0.004886 8 0.67590 0.69544 0.030516
prp(class.tree.mu, type=1, extra=1, split.font = 1, varlen = -10, digits = -3)
Predict this observation (this is the same thing shown in your exam).
set.seed(13)
cvindx<-createFolds(trainIndex, k=10, returnTrain = TRUE)
ctrl <- trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)
tunegrid <- expand.grid(
.mtry = c(2, 5, 10, 15),
.splitrule = "gini",
.min.node.size = c(200,250,300, 350)
)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
rforest<-train(TargetBuy~., data=training, method="ranger", tuneGrid=tunegrid, metric="ROC",
num.trees=500, importance="impurity", trControl=ctrl )
stopCluster(cl)
rforest
## Random Forest
##
## 2401 samples
## 33 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2161, 2161, 2161, 2161, 2161, 2161, ...
## Resampling results across tuning parameters:
##
## mtry min.node.size ROC Sens Spec
## 2 200 0.7969406 0.033696041 0.9983176
## 2 250 0.7967645 0.019225416 0.9983176
## 2 300 0.7982899 0.007780217 0.9988732
## 2 350 0.7962571 0.000000000 1.0000000
## 5 200 0.8087844 0.277059656 0.9753118
## 5 250 0.8072548 0.245778560 0.9808688
## 5 300 0.8095142 0.224082294 0.9842611
## 5 350 0.8086102 0.197725128 0.9898877
## 10 200 0.8106792 0.435663260 0.9434860
## 10 250 0.8109584 0.427537038 0.9450925
## 10 300 0.8126413 0.399216913 0.9501683
## 10 350 0.8132080 0.381359958 0.9535594
## 15 200 0.8120443 0.471615526 0.9305568
## 15 250 0.8124344 0.458777433 0.9322623
## 15 300 0.8125042 0.433734757 0.9372375
## 15 350 0.8136489 0.432569593 0.9405111
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 15, splitrule = gini
## and min.node.size = 350.
plot(rforest)
varImp(rforest)
## ranger variable importance
##
## only 20 most important variables shown (out of 33)
##
## Overall
## DemAge 100.0000
## DemAffl 67.0975
## DemGender_U 16.2166
## DemGender_M 9.7908
## PromSpend 4.5109
## PromTime 2.4718
## `DemTVReg_Wales & West` 0.6413
## `DemTVReg_S West` 0.4775
## `DemReg_South West` 0.4161
## DemClusterGroup_F 0.3980
## DemClusterGroup_D 0.3369
## DemClusterGroup_B 0.3227
## PromClass_Tin 0.2737
## `DemTVReg_C Scotland` 0.2237
## DemClusterGroup_C 0.1826
## `DemTVReg_N West` 0.1631
## `DemTVReg_S & S East` 0.1545
## DemReg_North 0.1219
## DemTVReg_U 0.1171
## DemClusterGroup_E 0.1135
set.seed(13)
tunegrid <- expand.grid( n.trees = seq(50,350,50), interaction.depth = c(10, 20), shrinkage = c(0.1, 0.01), n.minobsinnode=c(25))
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
gb.tree <- train(TargetBuy~., data=training, method = 'gbm', trControl=ctrl, tuneGrid=tunegrid, metric='ROC')
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.1314 nan 0.0100 0.0026
## 2 1.1261 nan 0.0100 0.0025
## 3 1.1210 nan 0.0100 0.0025
## 4 1.1161 nan 0.0100 0.0025
## 5 1.1113 nan 0.0100 0.0023
## 6 1.1064 nan 0.0100 0.0024
## 7 1.1012 nan 0.0100 0.0023
## 8 1.0960 nan 0.0100 0.0023
## 9 1.0916 nan 0.0100 0.0019
## 10 1.0872 nan 0.0100 0.0022
## 20 1.0469 nan 0.0100 0.0019
## 40 0.9879 nan 0.0100 0.0012
## 60 0.9448 nan 0.0100 0.0007
## 80 0.9132 nan 0.0100 0.0005
## 100 0.8889 nan 0.0100 0.0004
## 120 0.8702 nan 0.0100 0.0002
## 140 0.8548 nan 0.0100 0.0001
## 150 0.8481 nan 0.0100 0.0002
stopCluster(cl)
gb.tree
## Stochastic Gradient Boosting
##
## 2401 samples
## 33 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2161, 2161, 2161, 2161, 2161, 2161, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 10 50 0.8175928 0.008018313 0.9988795
## 0.01 10 100 0.8188771 0.299782502 0.9731015
## 0.01 10 150 0.8194540 0.388241877 0.9529198
## 0.01 10 200 0.8192479 0.434590725 0.9444266
## 0.01 10 250 0.8189846 0.457324060 0.9411345
## 0.01 10 300 0.8179288 0.465725309 0.9395232
## 0.01 10 350 0.8174871 0.468828901 0.9390446
## 0.01 20 50 0.8127154 0.048799633 0.9972294
## 0.01 20 100 0.8137111 0.293580764 0.9702952
## 0.01 20 150 0.8145173 0.392801281 0.9522775
## 0.01 20 200 0.8132307 0.435679151 0.9427691
## 0.01 20 250 0.8133516 0.453194337 0.9389535
## 0.01 20 300 0.8120375 0.463462412 0.9367696
## 0.01 20 350 0.8108462 0.470662166 0.9362492
## 0.10 10 50 0.8100936 0.482113361 0.9339991
## 0.10 10 100 0.8000114 0.469624716 0.9272212
## 0.10 10 150 0.7955282 0.470740957 0.9172997
## 0.10 10 200 0.7918043 0.472215388 0.9095674
## 0.10 10 250 0.7837146 0.468113206 0.9089614
## 0.10 10 300 0.7816476 0.468255058 0.9032955
## 0.10 10 350 0.7772323 0.469358650 0.9040316
## 0.10 20 50 0.8012344 0.464128990 0.9239922
## 0.10 20 100 0.7910367 0.478840700 0.9105516
## 0.10 20 150 0.7829584 0.481327692 0.9039522
## 0.10 20 200 0.7799780 0.477947126 0.8972636
## 0.10 20 250 0.7782941 0.486487732 0.8949487
## 0.10 20 300 0.7759009 0.488122952 0.8829170
## 0.10 20 350 0.7753893 0.488053545 0.8816929
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 10, shrinkage = 0.01 and n.minobsinnode = 25.
numWts<-500
tunegrid<-expand.grid( .size=4, .decay= c(0.1, 0.5))
x<-select(training, -TargetBuy)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
nnetFit.ou<-train(x, y=training$TargetBuy,
method="nnet",
metric="ROC",
linout=FALSE,
preProcess = c("range"),
tuneGrid = tunegrid,
trace=FALSE,
maxit=200,
MaxNWts=numWts,
trControl=ctrl)
stopCluster(cl)
numWts<-500
tunegrid<-expand.grid( .size=1:10, .decay= c(0.1, 0.5))
terms<-names(lrstep$finalModel$coefficients)[-1]
##From the terms
x<-select(training,"DemAffl", "DemAge","PromTime", "DemClusterGroup_D", "DemGender_M" ,"DemGender_U" , "PromClass_Tin", "DemTVReg_Yorkshire")
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
nnetFit.mu<-train(x, y=training$TargetBuy,
method="nnet",
metric="ROC",
linout=FALSE,
preProcess = c("range"),
tuneGrid = tunegrid,
trace=FALSE,
maxit=200,
MaxNWts=numWts,
trControl=ctrl)
stopCluster(cl)
nnetFit.mu
## Neural Network
##
## 2401 samples
## 8 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: re-scaling to [0, 1] (8)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2161, 2161, 2161, 2161, 2161, 2161, ...
## Resampling results across tuning parameters:
##
## size decay ROC Sens Spec
## 1 0.1 0.8112700 0.4416529 0.9344226
## 1 0.5 0.8111686 0.4232376 0.9450799
## 2 0.1 0.8156336 0.4373272 0.9428275
## 2 0.5 0.8135827 0.4202489 0.9506800
## 3 0.1 0.8153818 0.4399508 0.9395891
## 3 0.5 0.8135786 0.4186096 0.9501335
## 4 0.1 0.8125666 0.4492085 0.9429884
## 4 0.5 0.8135252 0.4139792 0.9507218
## 5 0.1 0.8111561 0.4474219 0.9406074
## 5 0.5 0.8137573 0.4156185 0.9507218
## 6 0.1 0.8107045 0.4521594 0.9401323
## 6 0.5 0.8138450 0.4156185 0.9507218
## 7 0.1 0.8103871 0.4510175 0.9418438
## 7 0.5 0.8137632 0.4156185 0.9501335
## 8 0.1 0.8101365 0.4508425 0.9412638
## 8 0.5 0.8136752 0.4156185 0.9501335
## 9 0.1 0.8100122 0.4531133 0.9413644
## 9 0.5 0.8137748 0.4156185 0.9501335
## 10 0.1 0.8100317 0.4544981 0.9413612
## 10 0.5 0.8137838 0.4156185 0.9501335
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 2 and decay = 0.1.