A supermarket is offering a new line of organic products, and management wants to determine which customers are likely to purchase them. The supermarket has a loyalty program (think Kroger Card). As an initial buyer incentive, the supermarket provided coupons for the organic products to all loyalty program participants and collected data that includes whether these customers purchased any of the organic products.
The target variable is TargetBuy, which indicates whether or not an organics purchase was made.
pacman::p_load(caret, rpart, tidyverse, nnet, ranger, doParallel, gbm, xgboost, DataExplorer, fastDummies, ROCR, pROC, rpart.plot, skimr)
df<-read_csv("I:\\Classes\\ISA 491 Data Mining\\Exams\\Spring 2022\\Final Exam\\organics.csv")
df$ID<-as.factor(df$ID)
df$TargetBuy<-as.factor(df$TargetBuy)
df$TargetBuy<-recode_factor(df$TargetBuy, "0"="No", "1"="Yes" )
prop.table(table(df$TargetBuy))
##
## No Yes
## 0.7522837 0.2477163
df$ID<-NULL  # drop the ID column so it is not treated as a predictor in the models below
df<-dummy_columns(df, select_columns = c("DemClusterGroup", "DemGender", "DemReg", "DemTVReg", "PromClass"), remove_first_dummy = TRUE, remove_selected_columns = TRUE)
plot_bar(df)
plot_histogram(df)
plot_missing(df)
df$DemAffl[is.na(df$DemAffl)]<-median(df$DemAffl, na.rm=TRUE)
df$DemAge[is.na(df$DemAge)]<-median(df$DemAge, na.rm=TRUE)
df$PromTime[is.na(df$PromTime)]<-median(df$PromTime, na.rm=TRUE)
plot_missing(df)
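Equivalently, the three median imputations above can be collapsed into a single dplyr/tidyr step; a minimal sketch with the same behavior:
df <- df %>%
  mutate(across(c(DemAffl, DemAge, PromTime),
                ~ replace_na(.x, median(.x, na.rm = TRUE))))  # replace NAs with each column's median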
set.seed(13)
trainIndex<-createDataPartition(df$TargetBuy, p=0.8, list=FALSE, times=1)
training<-df[trainIndex,]
valid<-df[-trainIndex,]
prop.table(table(training$TargetBuy))
##
## No Yes
## 0.752292 0.247708
prop.table(table(valid$TargetBuy))
##
## No Yes
## 0.7522502 0.2477498
table(training$TargetBuy)
##
## No Yes
## 13375 4404
class.tree<-rpart(TargetBuy~., data=training, control=rpart.control(maxdepth = 4, minsplit=20, cp=0.002, xval=10), method="class", model=TRUE)
small.tree<-prune(class.tree, cp=class.tree$cptable[which.min(class.tree$cptable[,"xerror"]), "CP"])
prp(small.tree, type=1, extra=1, split.font = 1, varlen = -10, digits = -3)  # plot the pruned tree
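The cp value used for pruning comes from the tree's complexity-parameter table; rpart's printcp() displays the cross-validated error (xerror) at each candidate cp, which is what the which.min() selection above is scanning:
printcp(class.tree)  # cp table: rel error and cross-validated xerror at each split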
set.seed(13)
index<-seq_len(nrow(training))
cvindx<-caret::createFolds(index, k=10, returnTrain = TRUE)
ctrl <- caret::trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)
tuneGrid <- expand.grid(
  alpha = 1,                             # LASSO penalty
  lambda = seq(0.0001, 1, length = 100)
)
lasso_model <- train(
  TargetBuy ~ .,
  data = training,
  method = "glmnet",
  trControl = ctrl,
  metric = "ROC",
  tuneGrid = tuneGrid,
  family = "binomial"
)
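caret's plot method for train objects gives a quick look at how cross-validated ROC changes across the lambda grid, a useful sanity check on the selected penalty; a minimal sketch:
plot(lasso_model)  # cross-validated ROC as a function of the regularization strength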
tunegrid <- expand.grid(
  .mtry = c(5, 10, 15, 20),
  .splitrule = "gini",
  .min.node.size = c(10, 20)
)
tunegrid
## .mtry .splitrule .min.node.size
## 1 5 gini 10
## 2 10 gini 10
## 3 15 gini 10
## 4 20 gini 10
## 5 5 gini 20
## 6 10 gini 20
## 7 15 gini 20
## 8 20 gini 20
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
rforest<-train(TargetBuy~., data=training, method="ranger", tuneGrid=tunegrid, metric="ROC",
num.trees=450, importance="impurity", trControl=ctrl )
stopCluster(cl)
Grid <- expand.grid( n.trees = c(50,100), interaction.depth = c(10, 20), shrinkage = c(0.1, 0.01), n.minobsinnode=c(25))
Grid
## n.trees interaction.depth shrinkage n.minobsinnode
## 1 50 10 0.10 25
## 2 100 10 0.10 25
## 3 50 20 0.10 25
## 4 100 20 0.10 25
## 5 50 10 0.01 25
## 6 100 10 0.01 25
## 7 50 20 0.01 25
## 8 100 20 0.01 25
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
gb.tree <- train(TargetBuy~.,
                 data=training,
                 method = 'gbm',
                 trControl=ctrl,
                 tuneGrid=Grid,
                 metric='ROC')
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3371 nan 0.1000 0.0248
## 2 1.2958 nan 0.1000 0.0199
## 3 1.2615 nan 0.1000 0.0173
## 4 1.2327 nan 0.1000 0.0144
## 5 1.2083 nan 0.1000 0.0117
## 6 1.1859 nan 0.1000 0.0105
## 7 1.1679 nan 0.1000 0.0081
## 8 1.1510 nan 0.1000 0.0077
## 9 1.1368 nan 0.1000 0.0065
## 10 1.1241 nan 0.1000 0.0062
## 20 1.0543 nan 0.1000 0.0012
## 40 1.0141 nan 0.1000 -0.0003
## 50 1.0022 nan 0.1000 -0.0004
stopCluster(cl)
gb.tree
## Stochastic Gradient Boosting
##
## 8808 samples
## 31 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7927, 7928, 7925, 7928, 7927, 7928, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 10 50 0.8079458 0.7579599 0.7084514
## 0.01 10 100 0.8131453 0.7720359 0.7027773
## 0.01 20 50 0.8158259 0.7865675 0.6948320
## 0.01 20 100 0.8167942 0.7813417 0.6973289
## 0.10 10 50 0.8186098 0.7659006 0.7070954
## 0.10 10 100 0.8160595 0.7590937 0.7125464
## 0.10 20 50 0.8156714 0.7615865 0.7089110
## 0.10 20 100 0.8128358 0.7545493 0.7118635
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 10, shrinkage = 0.1 and n.minobsinnode = 25.
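The winning hyperparameters and each predictor's relative influence can be pulled straight from the train object; a short sketch:
gb.tree$bestTune  # n.trees = 50, interaction.depth = 10, shrinkage = 0.1, n.minobsinnode = 25
varImp(gb.tree)   # relative influence of each predictor in the boosted model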
Next, extract the active (nonzero) terms from the lasso model at the optimal lambda; these selected inputs are reused for the neural network below.
glmnet_fit <- lasso_model$finalModel
best_lambda <- lasso_model$bestTune$lambda
coef_mat <- coef(glmnet_fit, s = best_lambda)
coef_df <- data.frame(
  term = rownames(as.matrix(coef_mat)),
  estimate = as.numeric(as.matrix(coef_mat)),
  stringsAsFactors = FALSE
)
active_vars <- coef_df %>%
  filter(term != "(Intercept)", estimate != 0)
terms.lasso <- active_vars$term
terms.lasso
## [1] "DemAffl" "DemAge" "DemGender_M"
## [4] "DemGender_U" "DemGender_NA" "DemTVReg_Yorkshire"
## [7] "PromClass_Platinum"
tunegrid<-expand.grid(.size=5:10, .decay=c(0.01, 0.1))
numWts<-500
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
nnetFit<-train(x=training[,terms.lasso], y=training$TargetBuy,
               method="nnet",
               metric="ROC",
               linout=FALSE,
               preProcess = c("range"),
               tuneGrid = tunegrid,
               trace=FALSE,
               maxit=100,
               MaxNWts=numWts,
               trControl=ctrl)
stopCluster(cl)
nnetFit
## Neural Network
##
## 8808 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (7)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7927, 7928, 7925, 7928, 7927, 7928, ...
## Resampling results across tuning parameters:
##
## size decay ROC Sens Spec
## 5 0.01 0.8140407 0.7659052 0.7009622
## 5 0.10 0.8130660 0.7615809 0.7025484
## 6 0.01 0.8140055 0.7702177 0.6986900
## 6 0.10 0.8128765 0.7581785 0.7052773
## 7 0.01 0.8142962 0.7715824 0.6998258
## 7 0.10 0.8129711 0.7629486 0.7018692
## 8 0.01 0.8140276 0.7704471 0.6998258
## 8 0.10 0.8129663 0.7629471 0.7011894
## 9 0.01 0.8133531 0.7670375 0.7023232
## 9 0.10 0.8131242 0.7618123 0.7025541
## 10 0.01 0.8139703 0.7715768 0.6982416
## 10 0.10 0.8133046 0.7627229 0.7018687
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 7 and decay = 0.01.
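Because all four models were tuned on the same cross-validation folds (the cvindx indices inside ctrl), caret's resamples() can compare their fold-level performance directly; a minimal sketch:
model_list <- list(Lasso = lasso_model, RF = rforest, GBM = gb.tree, NNet = nnetFit)
cv_results <- resamples(model_list)  # pools fold-level ROC, Sens, and Spec across models
summary(cv_results)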
Model AUC (Training & Validation):

| Model | Training AUC | Validation AUC |
|---|---|---|
| GB Tree | 0.835 | 0.835 |
| NN | 0.819 | 0.829 |
| Random Forest | 0.931 | 0.828 |
| Lasso | 0.802 | 0.810 |

The boosted tree posts the highest validation AUC with essentially no train/validation gap, so it is carried forward for evaluation; the random forest's drop from 0.931 to 0.828 suggests overfitting.
p.gbtree<-predict(gb.tree, newdata=valid, type="prob")  # class probabilities for the validation set
pred<-prediction(p.gbtree[,2], valid$TargetBuy)
lift<-performance(pred, "lift", "rpp")
lift.results<-data.frame(Cutoffs=unlist(pred@cutoffs), True_Positive=unlist(pred@tp), Counts=unlist(pred@n.pos.pred))
lift.results[100,]
## Cutoffs True_Positive Counts
## 100 0.9142479 102 111
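The ROCR performance object plots directly as a lift chart (lift on the y-axis against the rate of positive predictions on the x-axis); a quick sketch:
plot(lift, main="Lift Chart: Boosted Tree, Validation Data")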
r<-roc(valid$TargetBuy, p.gbtree[,2])
coords(r,"best", ret="threshold")
## threshold
## 1 0.4602883
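pROC's coords() selects the "best" threshold with Youden's J statistic by default; requesting sensitivity and specificity alongside the threshold shows the operating point it implies:
coords(r, "best", ret=c("threshold", "sensitivity", "specificity"))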
eval<-data.frame(Target=valid$TargetBuy, Boosted.tree=p.gbtree[,2])  # scored validation set
eval$Cut<-as.factor(ifelse(eval$Boosted.tree > 0.4821911, "Yes", "No"))  # note: this cutoff differs slightly from the ROC-optimal 0.4602883 computed above
confusionMatrix(data=eval$Cut, reference=eval$Target, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2596 282
## Yes 747 819
##
## Accuracy : 0.7685
## 95% CI : (0.7558, 0.7808)
## No Information Rate : 0.7523
## P-Value [Acc > NIR] : 0.006207
##
## Kappa : 0.4559
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7439
## Specificity : 0.7765
## Pos Pred Value : 0.5230
## Neg Pred Value : 0.9020
## Prevalence : 0.2477
## Detection Rate : 0.1843
## Detection Prevalence : 0.3524
## Balanced Accuracy : 0.7602
##
## 'Positive' Class : Yes
##