A bank sought to use performance on an in-house subprime credit product to build an updated risk model, to be combined with other factors in future credit decisions. A sample of 3,000 applicants for the original credit product was selected, and credit bureau data describing these individuals at the time of application was recorded. The ultimate disposition of each loan was determined (paid off or default); for loans rejected at the time of application, a disposition was inferred from credit bureau records on loans obtained in a similar time frame. The target is coded 1 = Yes = default and 0 = No = paid off. The goal of the modeling exercise is to assign a risk of default to each potential loan candidate. In the summaries below, Interval means continuous and Binary/Nominal means categorical.

Data Summary

pacman::p_load(caret, ranger, pROC, dplyr, doParallel, gbm, nnet, DataExplorer)
df<-read.csv("credit.csv", stringsAsFactors = TRUE)
df$TARGET<-as.factor(df$TARGET)              # default indicator (outcome)
df$ID<-as.factor(df$ID)                      # applicant identifier
df$BanruptcyInd<-as.factor(df$BanruptcyInd)  # bankruptcy flag (column name as in the raw file)
source("data summary.R")
df$TARGET<-recode_factor(df$TARGET, "0"="No", "1"="Yes")  # caret needs valid R names as class labels
levels(df$TARGET)
## [1] "No"  "Yes"
prop.table(summary(df$TARGET))
##        No       Yes 
## 0.8333333 0.1666667
dim(df)
## [1] 3000   30
data.summary(df)
##                        mean         sd min max median length missing
## DerogCnt          1.4300000  2.7314685   0  51      0   3000       0
## CollectCnt        0.8570000  2.1613518   0  50      0   3000       0
## InqCnt06          3.1083333  3.4791712   0  40      2   3000       0
## InqTimeLast       3.1081081  4.6378309   0  24      1   3000     188
## InqFinanceCnt24   3.5550000  4.4775355   0  48      2   3000       0
## TLTimeFirst     170.1136667 92.8136998   6 933    151   3000       0
## TLTimeLast       11.8736667 16.3214124   0 342      7   3000       0
## TLCnt03           0.2750000  0.5820836   0   7      0   3000       0
## TLCnt12           1.8213333  1.9252655   0  15      1   3000       0
## TLCnt24           3.8823333  3.3967142   0  28      3   3000       0
## TLCnt             7.8795462  5.4215954   0  40      7   3000       3
## TLSatCnt         13.5116822  8.9317686   0  57     12   3000       4
## TLDel60Cnt        1.5220000  2.8096525   0  38      0   3000       0
## TLBadCnt24        0.5670000  1.3244228   0  16      0   3000       0
## TL75UtilCnt       3.1216822  2.6054350   0  20      3   3000      99
## TL50UtilCnt       4.0779042  3.1080759   0  23      3   3000      99
## TLDel3060Cnt24    0.7260000  1.1636331   0   8      0   3000       0
## TLDel90Cnt24      0.8146667  1.6095077   0  19      0   3000       0
## TLDel60CntAll     2.5220000  3.4072549   0  45      1   3000       0
## TLBadDerogCnt     1.4090000  2.4604343   0  47      0   3000       0
## TLDel60Cnt24      1.0683333  1.8061242   0  20      0   3000       0
##              levels mode missing
## TARGET            2   No       0
## ID             3000   66       0
## BanruptcyInd      2    0       0
## TLSum          2768   $0       0
## TLMaxSum       2798   $0       0
## TLBalHCPct      133   0%       0
## TLSatPct         95  50%       0
## TLOpenPct        92  50%       0
## TLOpen24Pct     134 100%       0

Missing values were imputed; the imputation code is not shown. The missingness pattern can be inspected first:

plot_missing(df)  # DataExplorer: percent missing per variable
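A sketch of what the unshown preprocessing plausibly did, based on the summaries above and the model output below: the numeric columns with missing values (InqTimeLast, TLCnt, TLSatCnt, TL75UtilCnt, TL50UtilCnt) need filling, and the $- and %-formatted factor columns (TLSum, TLMaxSum, TLBalHCPct, TLSatPct, TLOpenPct, TLOpen24Pct) must have been parsed to numeric, since the stepwise model later estimates single coefficients for several of them. Median imputation here is an assumption, not the author's confirmed method.

# Hypothetical reconstruction of the unshown step
for (v in c("TLSum", "TLMaxSum"))
  df[[v]] <- as.numeric(gsub("[$,]", "", df[[v]]))           # "$1,234" -> 1234
for (v in c("TLBalHCPct", "TLSatPct", "TLOpenPct", "TLOpen24Pct"))
  df[[v]] <- as.numeric(sub("%", "", df[[v]])) / 100         # "50%" -> 0.5
for (v in c("InqTimeLast", "TLCnt", "TLSatCnt", "TL75UtilCnt", "TL50UtilCnt"))
  df[[v]][is.na(df[[v]])] <- median(df[[v]], na.rm = TRUE)   # median imputation (assumed)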

Data Partition

df<-select(df, -ID)  # drop the identifier before modeling
set.seed(13)
index<-sample(1:nrow(df), round(0.9*nrow(df)), replace = FALSE)  # simple random 90/10 split

training<-df[index,]
dim(training)
## [1] 2700   29
valid<-df[-index,]
dim(valid)
## [1] 300  29
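With only one applicant in six defaulting, a class-stratified split is a common alternative to the simple random split used above; a sketch with caret::createDataPartition (not what was run here):

set.seed(13)
idx.strat <- createDataPartition(df$TARGET, p = 0.9, list = FALSE)  # preserves the No/Yes ratio
# training <- df[idx.strat, ]; valid <- df[-idx.strat, ]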

Logistic Regression

ctrl<-trainControl(method="none", summaryFunction = twoClassSummary, classProbs = TRUE, savePredictions = TRUE)  # single fit, no resampling
lrstep<-train(TARGET~., data=training, method="glmStepAIC", direction="both", metric="ROC", trControl=ctrl, trace=0)  # stepwise logistic regression by AIC
pst<-predict.train(lrstep, type="prob")  # in-sample class probabilities
rst<-roc(training$TARGET, pst[,2])
rst$auc
## Area under the curve: 0.7923
df.step<-lrstep$finalModel$df.null-lrstep$finalModel$df.residual
df.step
## [1] 14
lrstep$finalModel$coefficients
##     (Intercept)   BanruptcyInd1 InqFinanceCnt24     TLTimeFirst         TLCnt03 
##    -1.420980650    -0.406519986     0.050097648    -0.003323057    -0.409105906 
##           TLCnt        TLSatCnt      TLDel60Cnt     TL75UtilCnt      TLBalHCPct 
##     0.084145373    -0.066477484     0.107588295     0.139754546     0.011570515 
##        TLSatPct  TLDel3060Cnt24   TLDel60CntAll   TLBadDerogCnt     TLOpen24Pct 
##    -0.027881797     0.353049620    -0.058668940     0.052260094     0.002621397
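The coefficients are on the log-odds scale; exponentiating gives odds ratios, which are easier to read off (a sketch):

# e.g. exp(0.353) for TLDel3060Cnt24 is about 1.42: each additional 30-60 day
# delinquency in the past 24 months multiplies the odds of default by ~1.42
round(exp(lrstep$finalModel$coefficients), 3)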

Random Forest

set.seed(13)
cvindx<-createFolds(index, k=10, returnTrain = TRUE)  # 10 CV folds, reused by every tuned model below so their resamples are comparable
ctrl <- trainControl(method="cv", index=cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)

tunegrid <- expand.grid(
  .mtry = c(5, 10, 15, 20),
  .splitrule = "gini",
  .min.node.size = c(10, 20)
)
tunegrid
##   .mtry .splitrule .min.node.size
## 1     5       gini             10
## 2    10       gini             10
## 3    15       gini             10
## 4    20       gini             10
## 5     5       gini             20
## 6    10       gini             20
## 7    15       gini             20
## 8    20       gini             20
cl <- makePSOCKcluster(7) #starts the cluster
registerDoParallel(cl)


rforest<-train(TARGET~., data=training, method="ranger", tuneGrid=tunegrid, metric="ROC", 
               num.trees=400, importance="impurity", trControl=ctrl )

stopCluster(cl)
rforest
## Random Forest 
## 
## 2700 samples
##   28 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ... 
## Resampling results across tuning parameters:
## 
##   mtry  min.node.size  ROC        Sens       Spec     
##    5    10             0.7807264  0.9826166  0.1171361
##    5    20             0.7845018  0.9835107  0.1015944
##   10    10             0.7797207  0.9723473  0.1455742
##   10    20             0.7790955  0.9772391  0.1464025
##   15    10             0.7791902  0.9683506  0.1771778
##   15    20             0.7787742  0.9692245  0.1717918
##   20    10             0.7758084  0.9648320  0.1835606
##   20    20             0.7758921  0.9647273  0.1770231
## 
## Tuning parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 5, splitrule = gini
##  and min.node.size = 20.
p.rforest.t<-predict(rforest, newdata=training, type="prob")  # in-sample probabilities
rt<-roc(training$TARGET,  p.rforest.t[,2])
rt$auc
## Area under the curve: 0.9948
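The near-perfect in-sample AUC mostly reflects the forest memorizing the training data; the cross-validated ROC of about 0.78 above is the honest estimate. Because the forest was fit with importance="impurity", variable importances can be inspected (a sketch):

plot(varImp(rforest), top = 10)  # ten most influential predictors by impurity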

Boosted Tree

Grid <- expand.grid(
  n.trees = seq(50, 150, 50),
  interaction.depth = c(10, 20),
  shrinkage = c(0.1, 0.01),
  n.minobsinnode = 25
)
Grid
##    n.trees interaction.depth shrinkage n.minobsinnode
## 1       50                10      0.10             25
## 2      100                10      0.10             25
## 3      150                10      0.10             25
## 4       50                20      0.10             25
## 5      100                20      0.10             25
## 6      150                20      0.10             25
## 7       50                10      0.01             25
## 8      100                10      0.01             25
## 9      150                10      0.01             25
## 10      50                20      0.01             25
## 11     100                20      0.01             25
## 12     150                20      0.01             25
cl <- makePSOCKcluster(7) #starts the cluster
registerDoParallel(cl)

gb.tree <- train(TARGET~., 
                 data=training, 
                 method = 'gbm', 
                 trControl=ctrl, 
                 tuneGrid=Grid, 
                 metric='ROC')
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        0.9076             nan     0.0100    0.0016
##      2        0.9034             nan     0.0100    0.0016
##      3        0.8992             nan     0.0100    0.0016
##      4        0.8958             nan     0.0100    0.0011
##      5        0.8918             nan     0.0100    0.0013
##      6        0.8881             nan     0.0100    0.0013
##      7        0.8843             nan     0.0100    0.0014
##      8        0.8805             nan     0.0100    0.0013
##      9        0.8771             nan     0.0100    0.0010
##     10        0.8734             nan     0.0100    0.0013
##     20        0.8424             nan     0.0100    0.0008
##     40        0.7937             nan     0.0100    0.0008
##     60        0.7562             nan     0.0100    0.0004
##     80        0.7255             nan     0.0100    0.0003
##    100        0.7000             nan     0.0100    0.0002
##    120        0.6775             nan     0.0100    0.0001
##    140        0.6581             nan     0.0100    0.0001
##    150        0.6491             nan     0.0100    0.0001
stopCluster(cl)

gb.tree
## Stochastic Gradient Boosting 
## 
## 2700 samples
##   28 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec      
##   0.01       10                  50      0.7814997  1.0000000  0.00000000
##   0.01       10                 100      0.7856315  0.9981836  0.02673905
##   0.01       10                 150      0.7887540  0.9896939  0.07278545
##   0.01       20                  50      0.7857880  1.0000000  0.00000000
##   0.01       20                 100      0.7908018  0.9982167  0.02604960
##   0.01       20                 150      0.7912795  0.9901522  0.08285713
##   0.10       10                  50      0.7748154  0.9525918  0.25662565
##   0.10       10                 100      0.7640528  0.9445376  0.26443601
##   0.10       10                 150      0.7541765  0.9418990  0.25834650
##   0.10       20                  50      0.7712585  0.9532218  0.22485030
##   0.10       20                 100      0.7592403  0.9444463  0.24041818
##   0.10       20                 150      0.7539450  0.9431058  0.25677088
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  20, shrinkage = 0.01 and n.minobsinnode = 25.
p.gbtree<-predict(gb.tree, newdata=training, type="prob")  # in-sample probabilities
r.gbt<-roc(training$TARGET,  p.gbtree[,2])
r.gbt$auc
## Area under the curve: 0.8957

Neural Network

terms<-names(lrstep$finalModel$coefficients)[-1]  # predictors retained by the stepwise model
terms2<-gsub('.{1}$', '', terms[c(1)])            # strip the trailing "1" from the dummy name BanruptcyInd1
terms<-c(terms2,terms[2:14])
terms
##  [1] "BanruptcyInd"    "InqFinanceCnt24" "TLTimeFirst"     "TLCnt03"        
##  [5] "TLCnt"           "TLSatCnt"        "TLDel60Cnt"      "TL75UtilCnt"    
##  [9] "TLBalHCPct"      "TLSatPct"        "TLDel3060Cnt24"  "TLDel60CntAll"  
## [13] "TLBadDerogCnt"   "TLOpen24Pct"
length(terms)
## [1] 14
tunegrid<-expand.grid( .size=1:10, .decay= c(0.1, 0.5))
numWts<-500
cl <- makePSOCKcluster(7)
registerDoParallel(cl)

nnetFit<-train(x=training[,terms], y=training$TARGET, 
               method="nnet", 
               metric="ROC", 
               linout=FALSE,             # logistic output unit for classification
               preProcess = c("range"),  # rescale inputs to [0, 1]
               tuneGrid = tunegrid, 
               trace=FALSE,
               maxit=100,
               MaxNWts=numWts,
               trControl=ctrl)

stopCluster(cl)
nnetFit
## Neural Network 
## 
## 2700 samples
##   14 predictor
##    2 classes: 'No', 'Yes' 
## 
## Pre-processing: re-scaling to [0, 1] (13), ignore (1) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  ROC        Sens       Spec     
##    1    0.1    0.7826621  0.9725920  0.1782356
##    1    0.5    0.7832237  0.9797481  0.1318634
##    2    0.1    0.7827776  0.9740201  0.1803045
##    2    0.5    0.7826682  0.9779356  0.1482085
##    3    0.1    0.7801947  0.9708733  0.1736824
##    3    0.5    0.7828413  0.9779356  0.1435650
##    4    0.1    0.7851653  0.9709098  0.1967197
##    4    0.5    0.7827301  0.9779356  0.1443070
##    5    0.1    0.7822174  0.9700344  0.1891950
##    5    0.5    0.7827711  0.9779356  0.1461251
##    6    0.1    0.7841436  0.9718167  0.1898472
##    6    0.5    0.7828995  0.9779356  0.1461251
##    7    0.1    0.7838348  0.9678145  0.1889150
##    7    0.5    0.7829185  0.9779356  0.1461251
##    8    0.1    0.7835543  0.9691534  0.1880951
##    8    0.5    0.7828257  0.9779356  0.1461251
##    9    0.1    0.7859040  0.9682703  0.1808231
##    9    0.5    0.7829179  0.9779356  0.1484507
##   10    0.1    0.7843533  0.9696346  0.1869514
##   10    0.5    0.7827680  0.9779356  0.1461251
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 9 and decay = 0.1.
p.nnet<-predict(nnetFit, newdata=training[,terms], type="prob")  # in-sample probabilities
r.nnt<-roc(training$TARGET,  p.nnet[,2])
r.nnt$auc
## Area under the curve: 0.8094
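Because the random forest, boosted tree, and neural network all reused the same cvindx folds, their cross-validated results can be compared directly with caret::resamples (a sketch; the stepwise logistic model was fit with method="none" and has no resamples to include):

rs <- resamples(list(RF = rforest, GBM = gb.tree, NNet = nnetFit))
summary(rs)  # fold-level ROC, Sens and Spec for the three models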

Model Comparison

Model           ROC.Training  ROC.Validation  df
Linear Step        0.7922924       0.8212638  14
Random Forest      0.9948397       0.7655146
GB Tree            0.8957447       0.7837838
NN_log             0.8093949       0.8221113
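The validation column comes from scoring the 300 hold-out rows; that code is not shown, but a minimal sketch (the p.*.v names are placeholders) is:

p.lr.v <- predict(lrstep,  newdata=valid, type="prob")
p.rf.v <- predict(rforest, newdata=valid, type="prob")
p.gb.v <- predict(gb.tree, newdata=valid, type="prob")
p.nn.v <- predict(nnetFit, newdata=valid[,terms], type="prob")
roc(valid$TARGET, p.nn.v[,"Yes"])$auc  # should match the NN_log validation entry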

Model Justification

The neural network has the best validation AUC and is carried forward. A probability cutoff is chosen from a validation ROC curve via Youden's J (the default for coords(..., "best")) and applied to the NN scores; note the curve used for the cutoff is built from the random-forest probabilities. The objects p.rforest and eval are not defined above; assumed reconstructions are flagged in the comments.

p.rforest<-predict(rforest, newdata=valid, type="prob")  # assumed definition: hold-out probabilities
r<-roc(valid$TARGET,  p.rforest[,2])
coords(r,"best", ret="threshold")  # Youden-optimal cutoff
##   threshold
## 1 0.2710543
eval<-data.frame(Target=factor(valid$TARGET, levels=c("Yes","No")),  # assumed definition
                 NN=predict(nnetFit, newdata=valid[,terms], type="prob")[,"Yes"])
eval$Cut<-factor(ifelse(eval$NN > 0.2710543, "Yes", "No"), levels=levels(eval$Target))  # match level order
confusionMatrix(data=eval$Cut, reference=eval$Target, positive="Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  22  33
##        No   19 226
##                                          
##                Accuracy : 0.8267         
##                  95% CI : (0.779, 0.8678)
##     No Information Rate : 0.8633         
##     P-Value [Acc > NIR] : 0.97024        
##                                          
##                   Kappa : 0.3578         
##                                          
##  Mcnemar's Test P-Value : 0.07142        
##                                          
##             Sensitivity : 0.53659        
##             Specificity : 0.87259        
##          Pos Pred Value : 0.40000        
##          Neg Pred Value : 0.92245        
##              Prevalence : 0.13667        
##          Detection Rate : 0.07333        
##    Detection Prevalence : 0.18333        
##       Balanced Accuracy : 0.70459        
##                                          
##        'Positive' Class : Yes            
##