pacman::p_load(tidyverse, DataExplorer, nnet, ranger, gbm, xgboost, caret, pROC, ROCR, rpart, rpart.plot, doParallel, glmnet)

East-West Airlines has entered into a partnership with the wireless phone company Telecon to sell the latter’s service via direct mail. The data used to build these models is a sample of customers who already received the offer. We will construct a model to predict whether a customer purchases a wireless phone service contract (Phone_sale = Yes), and then use that model to classify additional customers.

Here is a snapshot of the data; the first value in each row is the row number from the original data frame.

Topflight Balance Qual_miles cc1_miles. cc2_miles. cc3_miles. Bonus_miles Bonus_trans Flight_miles_12mo Flight_trans_12 Online_12 Email Club_member Any_cc_miles_12mo Phone_sale
884 0 135168 0 1 0 0 29156 16 300 2 0 1 0 1 No
3017 1 97424 0 1 0 0 63796 17 0 0 0 1 0 1 No
1583 0 11001 0 0 0 0 255 5 0 0 0 1 0 0 No
4919 0 5010 0 0 0 0 0 0 0 0 0 0 0 0 No
483 1 10954 500 0 0 0 2500 1 0 0 0 1 0 0 No
3605 0 12964 500 0 0 0 450 3 350 2 0 1 0 0 No
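
DataExplorer is loaded in the setup chunk; a quick structural overview of the data could be produced with something like the following (a sketch, not part of the original analysis):

introduce(df)     # row/column counts, missingness, discrete vs. continuous columns
plot_missing(df)  # profile of missing values by variable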

Here is the class distribution of the response variable, Phone_sale.

## 
##        No       Yes 
## 0.8696848 0.1303152
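
The chunk that produced this table is not echoed; a minimal sketch of the calculation, assuming df is the full data frame with Phone_sale stored as a factor:

prop.table(table(df$Phone_sale))  # roughly 87% No, 13% Yes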

We then create training (60%) and validation (40%) partitions.

set.seed(13)
trainIndex<-createDataPartition(df$Phone_sale, p=0.6, list=FALSE, times=1)
train<-df[trainIndex, ]
valid<-df[-trainIndex,]
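
createDataPartition samples within the levels of Phone_sale, so the training split should preserve the roughly 13% Yes rate; a quick check (not in the original write-up):

round(prop.table(table(train$Phone_sale)), 3)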

Decision Tree Model OU

Your Eastern “Red Brick” colleagues constructed this model.

class.tree.ou<-rpart(Phone_sale~., data=train, control=rpart.control(maxdepth = 4, minsplit=20, cp=0.002, xval=10), method="class", model=TRUE)
printcp(class.tree.ou)
## 
## Classification tree:
## rpart(formula = Phone_sale ~ ., data = train, method = "class", 
##     model = TRUE, control = rpart.control(maxdepth = 4, minsplit = 20, 
##         cp = 0.002, xval = 10))
## 
## Variables actually used in tree construction:
## [1] Bonus_miles Bonus_trans
## 
## Root node error: 313/2400 = 0.13042
## 
## n= 2400 
## 
##          CP nsplit rel error xerror     xstd
## 1 0.0047923      0   1.00000 1.0000 0.052709
## 2 0.0020000      2   0.99042 1.0128 0.052994
prp(class.tree.ou, type=1, extra=1, split.font = 1, varlen = -10, digits = -3)
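
The OU write-up stops at the plot. The tree's discrimination could be checked the same way the neural net is evaluated at the end, for example:

p.tree<-predict(class.tree.ou, newdata=valid, type="prob")
roc(valid$Phone_sale, p.tree[, "Yes"])$auc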

Set up Cross Validation

set.seed(13)
# Build the 10 training-fold index sets once so the models below are tuned on the same resamples
cvindx<-createFolds(trainIndex, k=10, returnTrain = TRUE)
ctrl <- trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)

Random Forest

tunegrid <- expand.grid(
  .mtry = c(2, 5, 10),
  .splitrule = c("gini"),
  .min.node.size = c(10, 20, 50)
)
tunegrid
##   .mtry .splitrule .min.node.size
## 1     2       gini             10
## 2     5       gini             10
## 3    10       gini             10
## 4     2       gini             20
## 5     5       gini             20
## 6    10       gini             20
## 7     2       gini             50
## 8     5       gini             50
## 9    10       gini             50
cl <- makePSOCKcluster(6) # start a 6-worker cluster for parallel tuning
registerDoParallel(cl)


rforest<-train(Phone_sale~., data=train, method="ranger", tuneGrid=tunegrid, metric="ROC", 
               num.trees=600, importance="impurity", trControl=ctrl )

stopCluster(cl) 
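
The tuning results for the random forest are not printed here; the selected parameters and validation-set AUC could be inspected as below, mirroring the evaluation used for the neural net later:

rforest$bestTune
p.rf<-predict(rforest, newdata=valid, type="prob")
roc(valid$Phone_sale, p.rf[, "Yes"])$auc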

Boosted Tree

set.seed(13)

tuneGrid <- expand.grid(
  n.trees = seq(50, 100, 25),
  interaction.depth = c(10, 20),
  shrinkage = c(0.1, 0.01),
  n.minobsinnode = c(25)
)
tuneGrid
##    n.trees interaction.depth shrinkage n.minobsinnode
## 1       50                10      0.10             25
## 2       75                10      0.10             25
## 3      100                10      0.10             25
## 4       50                20      0.10             25
## 5       75                20      0.10             25
## 6      100                20      0.10             25
## 7       50                10      0.01             25
## 8       75                10      0.01             25
## 9      100                10      0.01             25
## 10      50                20      0.01             25
## 11      75                20      0.01             25
## 12     100                20      0.01             25
cl <- makePSOCKcluster(6) #starts the cluster
registerDoParallel(cl)

gb.tree <- train(Phone_sale~., 
                 data=train, 
                 method = 'gbm', 
                 trControl=ctrl, 
                 tuneGrid=tuneGrid, 
                 metric='ROC')
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        0.7729             nan     0.0100    0.0004
##      2        0.7713             nan     0.0100    0.0004
##      3        0.7695             nan     0.0100    0.0004
##      4        0.7678             nan     0.0100    0.0005
##      5        0.7661             nan     0.0100    0.0004
##      6        0.7645             nan     0.0100    0.0004
##      7        0.7628             nan     0.0100    0.0002
##      8        0.7613             nan     0.0100    0.0003
##      9        0.7600             nan     0.0100    0.0003
##     10        0.7586             nan     0.0100    0.0003
##     20        0.7454             nan     0.0100    0.0002
##     40        0.7241             nan     0.0100    0.0001
##     60        0.7063             nan     0.0100    0.0000
##     75        0.6950             nan     0.0100    0.0000
stopCluster(cl)

gb.tree
## Stochastic Gradient Boosting 
## 
## 2400 samples
##   14 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2160, 2160, 2160, 2160, 2160, 2160, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec       
##   0.01       10                  50      0.6366168  1.0000000  0.000000000
##   0.01       10                  75      0.6375202  1.0000000  0.000000000
##   0.01       10                 100      0.6372970  1.0000000  0.000000000
##   0.01       20                  50      0.6330344  1.0000000  0.000000000
##   0.01       20                  75      0.6376144  1.0000000  0.000000000
##   0.01       20                 100      0.6368114  1.0000000  0.000000000
##   0.10       10                  50      0.6220247  0.9961946  0.009703539
##   0.10       10                  75      0.6201846  0.9933275  0.006787330
##   0.10       10                 100      0.6156804  0.9894588  0.024274301
##   0.10       20                  50      0.6256814  0.9918474  0.027084013
##   0.10       20                  75      0.6147826  0.9840976  0.028501017
##   0.10       20                 100      0.6030173  0.9813216  0.031751791
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 75, interaction.depth =
##  20, shrinkage = 0.01 and n.minobsinnode = 25.
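
As with the other models, the boosted tree's validation performance can be checked directly (a sketch):

p.gbm<-predict(gb.tree, newdata=valid, type="prob")
roc(valid$Phone_sale, p.gbm[, "Yes"])$auc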

Logistic Regression OU

And the OU folks constructed this logistic regression model.

lrfull<-glm(Phone_sale~., data=train, family="binomial")
options(scipen = 999)
summary(lrfull)
## 
## Call:
## glm(formula = Phone_sale ~ ., family = "binomial", data = train)
## 
## Coefficients:
##                       Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       -2.617141057  0.148624193 -17.609 < 0.0000000000000002 ***
## Topflight          0.187622562  0.190979630   0.982              0.32589    
## Balance           -0.000003415  0.000001051  -3.248              0.00116 ** 
## Qual_miles        -0.000004201  0.000088819  -0.047              0.96228    
## cc1_miles.        -0.262885161  0.508008633  -0.517              0.60482    
## cc2_miles.         0.071280272  0.337021734   0.212              0.83250    
## cc3_miles.         0.347307014  0.614441644   0.565              0.57191    
## Bonus_miles        0.000006288  0.000003349   1.878              0.06042 .  
## Bonus_trans        0.019978715  0.009336412   2.140              0.03237 *  
## Flight_miles_12mo  0.000077401  0.000081928   0.945              0.34479    
## Flight_trans_12   -0.023730585  0.033341861  -0.712              0.47663    
## Online_12          0.126388443  0.077907955   1.622              0.10474    
## Email              0.234557711  0.139833081   1.677              0.09346 .  
## Club_member        0.031279370  0.216001187   0.145              0.88486    
## Any_cc_miles_12mo  0.838018878  0.525279654   1.595              0.11063    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1858.5  on 2399  degrees of freedom
## Residual deviance: 1777.3  on 2385  degrees of freedom
## AIC: 1807.3
## 
## Number of Fisher Scoring iterations: 5
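
Only Balance and Bonus_trans are significant at the 5% level. The full model's validation AUC could be computed as follows (a sketch, not part of the original OU output):

p.lr<-predict(lrfull, newdata=valid, type="response")
roc(valid$Phone_sale, p.lr)$auc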

Logistic Regression MU

The MU model is a lasso-penalized logistic regression fit with glmnet: alpha = 1 gives the lasso, lambda is tuned by cross-validation, and the predictors with nonzero coefficients at the selected lambda are retained.

tuneGrid <- expand.grid(
  alpha  = 1,
  lambda = seq(0.0001, 1, length = 100)
)

lasso_model <- train(
  Phone_sale ~ .,
  data      = train,
  method    = "glmnet",
  trControl = ctrl,
  metric    = "ROC",
  tuneGrid  = tuneGrid,
  family    = "binomial"
)
glmnet_fit  <- lasso_model$finalModel
best_lambda <- lasso_model$bestTune$lambda
coef_mat <- coef(glmnet_fit, s = best_lambda)

coef_df <- data.frame(
  term = rownames(as.matrix(coef_mat)),
  estimate = as.numeric(as.matrix(coef_mat)),
  stringsAsFactors = FALSE
)

active_vars <- coef_df %>%
  filter(term != "(Intercept)", estimate != 0)

terms.lasso <- active_vars$term
terms.lasso
##  [1] "Topflight"         "Balance"           "Qual_miles"       
##  [4] "cc1_miles."        "cc2_miles."        "cc3_miles."       
##  [7] "Bonus_miles"       "Bonus_trans"       "Flight_miles_12mo"
## [10] "Flight_trans_12"   "Online_12"         "Email"            
## [13] "Club_member"       "Any_cc_miles_12mo"

Neural Network Model

set.seed(13)

cvindx<-createFolds(trainIndex, k=10, returnTrain = TRUE)
ctrl <- trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)

tunegrid<-expand.grid( .size=1:12, .decay= c(0.01, 0.1))
tunegrid
##    .size .decay
## 1      1   0.01
## 2      2   0.01
## 3      3   0.01
## 4      4   0.01
## 5      5   0.01
## 6      6   0.01
## 7      7   0.01
## 8      8   0.01
## 9      9   0.01
## 10    10   0.01
## 11    11   0.01
## 12    12   0.01
## 13     1   0.10
## 14     2   0.10
## 15     3   0.10
## 16     4   0.10
## 17     5   0.10
## 18     6   0.10
## 19     7   0.10
## 20     8   0.10
## 21     9   0.10
## 22    10   0.10
## 23    11   0.10
## 24    12   0.10
numWts<-500  # upper bound on network weights; the largest grid entry (size = 12, 14 inputs) needs (14+1)*12 + 12 + 1 = 193

Note that we use the variables selected by the lasso as inputs to the neural net.

cl <- makePSOCKcluster(3) #Starts the parallel computing
registerDoParallel(cl)

nnetFit<-train(x=train[,terms.lasso], y=train$Phone_sale, 
               method="nnet", 
               metric="ROC", 
               linout=FALSE,
               preProcess = c("range"), 
               tuneGrid = tunegrid, 
               trace=FALSE,
               maxit=100,
               MaxNWts=numWts,
               trControl=ctrl)


stopCluster(cl) 
nnetFit
## Neural Network 
## 
## 2400 samples
##   14 predictor
##    2 classes: 'No', 'Yes' 
## 
## Pre-processing: re-scaling to [0, 1] (14) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2160, 2160, 2160, 2160, 2160, 2160, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  ROC        Sens       Spec       
##    1    0.01   0.6184818  0.9995370  0.000000000
##    1    0.10   0.6251293  0.9990697  0.000000000
##    2    0.01   0.6240871  0.9942632  0.020927215
##    2    0.10   0.6337693  0.9981190  0.006325581
##    3    0.01   0.6130470  0.9937334  0.022970297
##    3    0.10   0.6309296  0.9956595  0.019744491
##    4    0.01   0.6065989  0.9952046  0.006325581
##    4    0.10   0.6258915  0.9971201  0.019434893
##    5    0.01   0.6135885  0.9884860  0.016513932
##    5    0.10   0.6364126  0.9966400  0.012957160
##    6    0.01   0.6109005  0.9861826  0.028237055
##    6    0.10   0.6314031  0.9966323  0.015898337
##    7    0.01   0.6042524  0.9846435  0.026755480
##    7    0.10   0.6368835  0.9961546  0.012957160
##    8    0.01   0.6194569  0.9874658  0.026632449
##    8    0.10   0.6341326  0.9966400  0.012957160
##    9    0.01   0.6205872  0.9875069  0.028773369
##    9    0.10   0.6394672  0.9961823  0.013266758
##   10    0.01   0.6114289  0.9847545  0.019040110
##   10    0.10   0.6366024  0.9956873  0.015898337
##   11    0.01   0.6116548  0.9851891  0.025143727
##   11    0.10   0.6338810  0.9956873  0.022376070
##   12    0.01   0.6178583  0.9856311  0.033858255
##   12    0.10   0.6348118  0.9961524  0.012957160
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 9 and decay = 0.1.
p.nnet<-predict(nnetFit, newdata=train, type="prob")
r<-roc(train$Phone_sale,  p.nnet[,2])
r.nnet.auct<-r$auc
r.nnet.auct
## Area under the curve: 0.6785
p.nnet<-predict(nnetFit, newdata=valid, type="prob")
r<-roc(valid$Phone_sale,  p.nnet[,2])
r.nnet.auc<-r$auc
r.nnet.auc
## Area under the curve: 0.6533