A supermarket is offering a new line of organic products. The supermarket's management wants to determine which customers are likely to purchase these products. The supermarket has a loyalty program (think Kroger Card). As an initial buyer incentive, the supermarket provided coupons for the organic products to all loyalty program participants and collected data on whether these customers purchased any of the organic products.

The target variable is TargetBuy, which indicates whether or not an organics purchase was made.

Read in Data

pacman::p_load(caret, rpart, tidyverse, nnet, ranger, doParallel, gbm, xgboost, DataExplorer, fastDummies, ROCR, pROC, rpart.plot, skimr)   # load (installing if needed) every package used below
df<-read_csv("I:\\Classes\\ISA 491 Data Mining\\Exams\\Spring 2022\\Final Exam\\organics.csv")

Recode Response Variable

df$ID<-as.factor(df$ID)   # ID is a record label, not a numeric predictor
df$TargetBuy<-as.factor(df$TargetBuy)
df$TargetBuy<-recode_factor(df$TargetBuy, "0"="No", "1"="Yes")   # valid R level names, as caret requires
prop.table(table(df$TargetBuy))
## 
##        No       Yes 
## 0.7522837 0.2477163

Create Dummy Variables

# Dummy-code the nominal inputs, dropping the first level of each and the original columns
df<-dummy_columns(df, select_columns = c("DemClusterGroup", "DemGender", "DemReg", "DemTVReg", "PromClass"), remove_first_dummy = TRUE, remove_selected_columns = TRUE)

Data Exploration

plot_bar(df)

plot_histogram(df)

plot_missing(df)

Imputation

# Median-impute the three numeric inputs with missing values
df$DemAffl[is.na(df$DemAffl)]<-median(df$DemAffl, na.rm=TRUE)
df$DemAge[is.na(df$DemAge)]<-median(df$DemAge, na.rm=TRUE)
df$PromTime[is.na(df$PromTime)]<-median(df$PromTime, na.rm=TRUE)
plot_missing(df)

Data Partition

set.seed(13)
trainIndex<-createDataPartition(df$TargetBuy, p=0.8, list=FALSE, times=1)   # stratified 80/20 split
train<-df[trainIndex,]
valid<-df[-trainIndex,]
prop.table(table(train$TargetBuy))
## 
##       No      Yes 
## 0.752292 0.247708
prop.table(table(valid$TargetBuy))
## 
##        No       Yes 
## 0.7522502 0.2477498

Create Training Data

table(train$TargetBuy)
## 
##    No   Yes 
## 13375  4404
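The training data used from here on has 8,808 rows (see the gbm output below), exactly twice the 4,404 Yes cases, which is consistent with down-sampling the No class to balance the response. The step that builds it is not shown; a minimal sketch, assuming caret's downSample() and dropping the ID label:

set.seed(13)
# Assumed step (not in the original): balance the classes by down-sampling the
# majority class; drop ID since it identifies records rather than predicting
training<-downSample(x=train[, setdiff(names(train), c("ID", "TargetBuy"))],
                     y=train$TargetBuy, yname="TargetBuy")
table(training$TargetBuy)   # expect 4404 No and 4404 Yes (8808 rows)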

Decision Tree Model

class.tree<-rpart(TargetBuy~., data=training, control=rpart.control(maxdepth=4, minsplit=20, cp=0.002, xval=10), method="class", model=TRUE)
# Prune back to the subtree with the smallest cross-validated error
small.tree<-prune(class.tree, cp=class.tree$cptable[which.min(class.tree$cptable[,"xerror"]), "CP"])

prp(small.tree, type=1, extra=1, split.font=1, varlen=-10, digits=-3)   # plot the pruned tree

Set up CV for Modeling

set.seed(13)
index<-seq_len(nrow(training))
cvindx<-caret::createFolds(index, k=10, returnTrain=TRUE)   # fix the folds so every model sees the same resamples
ctrl <- caret::trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)

Logistic Regression

tuneGrid = expand.grid(
  alpha = 1,                             # alpha = 1 is the LASSO penalty
  lambda = seq(0.0001, 1, length = 100)  # candidate penalty strengths
)
lasso_model <- train(
  TargetBuy ~ .,
  data = training,
  method = "glmnet",
  trControl = ctrl,
  metric = "ROC",
  tuneGrid = tuneGrid,
  family = "binomial"
)

Random Forest

tunegrid <- expand.grid(
  .mtry = c(5, 10, 15, 20),
  .splitrule = "gini",
  .min.node.size = c(10, 20)
)
tunegrid
##   .mtry .splitrule .min.node.size
## 1     5       gini             10
## 2    10       gini             10
## 3    15       gini             10
## 4    20       gini             10
## 5     5       gini             20
## 6    10       gini             20
## 7    15       gini             20
## 8    20       gini             20
cl <- makePSOCKcluster(7)   # run the CV folds in parallel on 7 workers
registerDoParallel(cl)


rforest<-train(TargetBuy~., data=training, method="ranger", tuneGrid=tunegrid, metric="ROC", 
               num.trees=450, importance="impurity", trControl=ctrl )

stopCluster(cl)
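Since importance="impurity" was requested, the fitted model's chosen tuning values and variable importances can be inspected. A brief sketch (output not shown in the original):

rforest$bestTune   # mtry and min.node.size selected by CV
varImp(rforest)    # impurity-based variable importance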

Boosted Tree

Grid <- expand.grid(n.trees = c(50, 100),
                    interaction.depth = c(10, 20),
                    shrinkage = c(0.1, 0.01),
                    n.minobsinnode = c(25))
Grid
##   n.trees interaction.depth shrinkage n.minobsinnode
## 1      50                10      0.10             25
## 2     100                10      0.10             25
## 3      50                20      0.10             25
## 4     100                20      0.10             25
## 5      50                10      0.01             25
## 6     100                10      0.01             25
## 7      50                20      0.01             25
## 8     100                20      0.01             25
cl <- makePSOCKcluster(7) 
registerDoParallel(cl)

gb.tree <- train(TargetBuy~., 
                 data=training, 
                 method = 'gbm', 
                 trControl=ctrl, 
                 tuneGrid=Grid, 
                 metric='ROC',
                 verbose=FALSE)   # suppress gbm's per-iteration training log
stopCluster(cl)

gb.tree
## Stochastic Gradient Boosting 
## 
## 8808 samples
##   31 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 7927, 7928, 7925, 7928, 7927, 7928, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec     
##   0.01       10                  50      0.8079458  0.7579599  0.7084514
##   0.01       10                 100      0.8131453  0.7720359  0.7027773
##   0.01       20                  50      0.8158259  0.7865675  0.6948320
##   0.01       20                 100      0.8167942  0.7813417  0.6973289
##   0.10       10                  50      0.8186098  0.7659006  0.7070954
##   0.10       10                 100      0.8160595  0.7590937  0.7125464
##   0.10       20                  50      0.8156714  0.7615865  0.7089110
##   0.10       20                 100      0.8128358  0.7545493  0.7118635
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
##  10, shrinkage = 0.1 and n.minobsinnode = 25.

Neural Network

Use the active (nonzero-coefficient) terms from the LASSO model as the neural network inputs.

glmnet_fit  <- lasso_model$finalModel
best_lambda <- lasso_model$bestTune$lambda
coef_mat <- coef(glmnet_fit, s = best_lambda)   # coefficients at the CV-chosen lambda

coef_df <- data.frame(
  term = rownames(as.matrix(coef_mat)),
  estimate = as.numeric(as.matrix(coef_mat)),
  stringsAsFactors = FALSE
)

# Keep only the predictors with nonzero coefficients
active_vars <- coef_df %>%
  filter(term != "(Intercept)", estimate != 0)

terms.lasso <- active_vars$term
terms.lasso
## [1] "DemAffl"            "DemAge"             "DemGender_M"       
## [4] "DemGender_U"        "DemGender_NA"       "DemTVReg_Yorkshire"
## [7] "PromClass_Platinum"
tunegrid<-expand.grid(.size=5:10, .decay=c(0.01, 0.1))   # hidden units and weight decay to tune
numWts<-500
cl <- makePSOCKcluster(7)
registerDoParallel(cl)

nnetFit<-train(x=training[,terms.lasso], y=training$TargetBuy, 
               method="nnet", 
               metric="ROC", 
               linout=FALSE,              # logistic output for classification
               preProcess = c("range"),   # rescale inputs to [0, 1]
               tuneGrid = tunegrid, 
               trace=FALSE,
               maxit=100,
               MaxNWts=numWts,
               trControl=ctrl)

stopCluster(cl)
nnetFit
## Neural Network 
## 
## 8808 samples
##    7 predictor
##    2 classes: 'No', 'Yes' 
## 
## Pre-processing: re-scaling to [0, 1] (7) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 7927, 7928, 7925, 7928, 7927, 7928, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  ROC        Sens       Spec     
##    5    0.01   0.8140407  0.7659052  0.7009622
##    5    0.10   0.8130660  0.7615809  0.7025484
##    6    0.01   0.8140055  0.7702177  0.6986900
##    6    0.10   0.8128765  0.7581785  0.7052773
##    7    0.01   0.8142962  0.7715824  0.6998258
##    7    0.10   0.8129711  0.7629486  0.7018692
##    8    0.01   0.8140276  0.7704471  0.6998258
##    8    0.10   0.8129663  0.7629471  0.7011894
##    9    0.01   0.8133531  0.7670375  0.7023232
##    9    0.10   0.8131242  0.7618123  0.7025541
##   10    0.01   0.8139703  0.7715768  0.6982416
##   10    0.10   0.8133046  0.7627229  0.7018687
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 7 and decay = 0.01.

Model Comparison

Model AUC (Training & Validation)

Model           Training AUC   Validation AUC
GB Tree         0.835          0.835
NN              0.819          0.829
Random Forest   0.931          0.828
Lasso           0.802          0.810

The boosted tree has the best validation AUC and essentially no train/validation gap, while the random forest clearly overfits (0.931 training vs. 0.828 validation), so the boosted tree is carried forward.
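The AUC values in the table are not computed in the code shown. A minimal sketch of how the validation-side number could be reproduced for the boosted tree with pROC (the same pattern applies to the other models); p.gbtree is also used by the lift and cutoff code below:

p.gbtree<-predict(gb.tree, newdata=valid, type="prob")   # validation class probabilities
pROC::auc(pROC::roc(valid$TargetBuy, p.gbtree[,2]))      # validation AUC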

pred<-prediction(p.gbtree[,2], valid$TargetBuy)   # ROCR prediction object for the validation set
lift<-performance(pred, "lift", "rpp")
lift.results<-data.frame(Cutoffs=unlist(pred@cutoffs), True_Positive=unlist(pred@tp), Counts=unlist(pred@n.pos.pred))
lift.results[100,]   # results at the 100th-highest score cutoff
##       Cutoffs True_Positive Counts
## 100 0.9142479           102    111
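In other words, the 111 customers scored above 0.914 include 102 actual buyers (about 92%), versus a 24.8% base rate of buyers overall, a lift of roughly 3.7 at the top of the list.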

Model Justification

r<-roc(valid$TargetBuy, p.gbtree[,2])
coords(r, "best", ret="threshold")   # ROC-optimal (Youden) cutoff on the validation data
##   threshold
## 1 0.4602883
# Assumed step (not shown above): gather the validation target and boosted-tree scores
eval<-data.frame(Target=valid$TargetBuy, Boosted.tree=p.gbtree[,2])
eval$Cut<-as.factor(ifelse(eval$Boosted.tree > 0.4821911, "Yes", "No"))   # cutoff close to the ROC-optimal threshold above
confusionMatrix(data=eval$Cut, reference=eval$Target, positive="Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2596  282
##        Yes  747  819
##                                           
##                Accuracy : 0.7685          
##                  95% CI : (0.7558, 0.7808)
##     No Information Rate : 0.7523          
##     P-Value [Acc > NIR] : 0.006207        
##                                           
##                   Kappa : 0.4559          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7439          
##             Specificity : 0.7765          
##          Pos Pred Value : 0.5230          
##          Neg Pred Value : 0.9020          
##              Prevalence : 0.2477          
##          Detection Rate : 0.1843          
##    Detection Prevalence : 0.3524          
##       Balanced Accuracy : 0.7602          
##                                           
##        'Positive' Class : Yes             
##
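At this cutoff the boosted tree's accuracy (0.769) is significantly better than the no-information rate (0.752, p = 0.006), it identifies about 74% of actual organics buyers (sensitivity 0.744), and customers it labels as non-buyers truly are non-buyers 90% of the time (NPV 0.902), which supports using this model to target the organics promotion.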