A bank sought to use performance on an in-house subprime credit product to build an updated risk model, to be combined with other factors in future credit decisions. A sample of 3000 applicants for the original credit product was selected, and credit bureau data describing each individual at the time of application was recorded. The ultimate disposition of each loan was determined (paid off or default); for loans rejected at the time of application, a disposition was inferred from credit bureau records on loans obtained in a similar time frame. The target is coded 1 = Yes = default and 0 = No = paid off. The goal of the modeling exercise is to assign a risk of default to each potential loan candidate. In the summaries below, Interval = continuous and Binary/Nominal = categorical.
# Load (installing if necessary) the required packages
pacman::p_load(caret, ranger, pROC, dplyr, doParallel, gbm, nnet, DataExplorer)
df <- read.csv("credit.csv", stringsAsFactors = TRUE)
# Recast the target, the ID, and the bankruptcy indicator as factors
# (note: "BanruptcyInd" is misspelled in the source data, so the name is kept as-is)
df$TARGET <- as.factor(df$TARGET)
df$ID <- as.factor(df$ID)
df$BanruptcyInd <- as.factor(df$BanruptcyInd)
source("data summary.R")  # provides the custom data.summary() helper used below
# Relabel the target so the levels are valid R names, as caret requires for classProbs
df$TARGET <- recode_factor(df$TARGET, "0" = "No", "1" = "Yes")
levels(df$TARGET)
## [1] "No" "Yes"
prop.table(summary(df$TARGET))
## No Yes
## 0.8333333 0.1666667
dim(df)
## [1] 3000 30
data.summary(df)
## mean sd min max median length missing
## DerogCnt 1.4300000 2.7314685 0 51 0 3000 0
## CollectCnt 0.8570000 2.1613518 0 50 0 3000 0
## InqCnt06 3.1083333 3.4791712 0 40 2 3000 0
## InqTimeLast 3.1081081 4.6378309 0 24 1 3000 188
## InqFinanceCnt24 3.5550000 4.4775355 0 48 2 3000 0
## TLTimeFirst 170.1136667 92.8136998 6 933 151 3000 0
## TLTimeLast 11.8736667 16.3214124 0 342 7 3000 0
## TLCnt03 0.2750000 0.5820836 0 7 0 3000 0
## TLCnt12 1.8213333 1.9252655 0 15 1 3000 0
## TLCnt24 3.8823333 3.3967142 0 28 3 3000 0
## TLCnt 7.8795462 5.4215954 0 40 7 3000 3
## TLSatCnt 13.5116822 8.9317686 0 57 12 3000 4
## TLDel60Cnt 1.5220000 2.8096525 0 38 0 3000 0
## TLBadCnt24 0.5670000 1.3244228 0 16 0 3000 0
## TL75UtilCnt 3.1216822 2.6054350 0 20 3 3000 99
## TL50UtilCnt 4.0779042 3.1080759 0 23 3 3000 99
## TLDel3060Cnt24 0.7260000 1.1636331 0 8 0 3000 0
## TLDel90Cnt24 0.8146667 1.6095077 0 19 0 3000 0
## TLDel60CntAll 2.5220000 3.4072549 0 45 1 3000 0
## TLBadDerogCnt 1.4090000 2.4604343 0 47 0 3000 0
## TLDel60Cnt24 1.0683333 1.8061242 0 20 0 3000 0
## levels mode missing
## TARGET 2 No 0
## ID 3000 66 0
## BanruptcyInd 2 0 0
## TLSum 2768 $0 0
## TLMaxSum 2798 $0 0
## TLBalHCPct 133 0% 0
## TLSatPct 95 50% 0
## TLOpenPct 92 50% 0
## TLOpen24Pct 134 100% 0
plot_missing(df)
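One gap worth flagging: data.summary() reports TLSum and TLMaxSum as factors with roughly 2800 "$"-formatted levels and the *Pct fields as "%"-formatted factors, and several counts carry missing values, yet the stepwise model below estimates single numeric coefficients for TLBalHCPct and TLSatPct and every model fits without complaint. Some cleaning evidently happens off-screen (perhaps inside data summary.R). A minimal sketch of what it might look like; every step here is an assumption, not the original pipeline:

# Hedged sketch of the off-screen cleaning; the column lists come from the summary above
money_cols <- c("TLSum", "TLMaxSum")
pct_cols   <- c("TLBalHCPct", "TLSatPct", "TLOpenPct", "TLOpen24Pct")
# Strip "$"/"," and "%" and convert to numeric
df[money_cols] <- lapply(df[money_cols], function(x) as.numeric(gsub("[$,]", "", as.character(x))))
df[pct_cols]   <- lapply(df[pct_cols], function(x) as.numeric(sub("%", "", as.character(x))) / 100)
# Median-impute the remaining numeric NAs (InqTimeLast, TLCnt, TLSatCnt, TL75UtilCnt, TL50UtilCnt)
num_cols <- sapply(df, is.numeric)
df[num_cols] <- lapply(df[num_cols], function(x) { x[is.na(x)] <- median(x, na.rm = TRUE); x })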
df <- select(df, -ID)  # drop the identifier; it carries no predictive signal
set.seed(13)
# Hold out a random 10% of rows for validation
index <- sample(1:nrow(df), round(0.9 * nrow(df)), replace = FALSE)
training <- df[index, ]
dim(training)
## [1] 2700 29
valid<-df[-index,]
dim(valid)
## [1] 300 29
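With only about 17% defaults, a purely random split can drift from the overall class balance. An alternative sketch (not what was run above; training_s and valid_s are hypothetical names) uses caret's createDataPartition, which samples within each level of TARGET:

# Class-stratified 90/10 split; preserves the 0.833/0.167 mix in both pieces
set.seed(13)
idx <- createDataPartition(df$TARGET, p = 0.9, list = FALSE)
training_s <- df[idx, ]
valid_s    <- df[-idx, ]
prop.table(table(training_s$TARGET))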
ctrl<-trainControl(method="none", summaryFunction = twoClassSummary, classProbs = TRUE, savePredictions = TRUE)
lrstep<-train(TARGET~., data=training, method="glmStepAIC", direction="both", metric="ROC", trControl=ctrl, trace=0)
pst<-predict.train(lrstep, type="prob")
rst<-roc(training$TARGET, pst[,2])
rst$auc
## Area under the curve: 0.7923
# Degrees of freedom consumed by the selected model (number of slope terms retained)
df.step <- lrstep$finalModel$df.null - lrstep$finalModel$df.residual
df.step
## [1] 14
lrstep$finalModel$coefficients
## (Intercept) BanruptcyInd1 InqFinanceCnt24 TLTimeFirst TLCnt03
## -1.420980650 -0.406519986 0.050097648 -0.003323057 -0.409105906
## TLCnt TLSatCnt TLDel60Cnt TL75UtilCnt TLBalHCPct
## 0.084145373 -0.066477484 0.107588295 0.139754546 0.011570515
## TLSatPct TLDel3060Cnt24 TLDel60CntAll TLBadDerogCnt TLOpen24Pct
## -0.027881797 0.353049620 -0.058668940 0.052260094 0.002621397
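Since this is a logistic model, exponentiating the coefficients turns them into odds ratios, which are easier to read than log-odds. A short sketch:

# Odds ratios for the retained terms; e.g. exp(0.3530) is about 1.42, so each
# additional 30-60 day delinquency in the past 24 months multiplies the odds
# of default by roughly 1.42, holding the other terms fixed
round(exp(lrstep$finalModel$coefficients), 3)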
set.seed(13)
# Build one set of 10 folds up front so every tuned model resamples on identical partitions
cvindx <- createFolds(index, k = 10, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = cvindx, summaryFunction = twoClassSummary, classProbs = TRUE)
tunegrid <- expand.grid(
.mtry = c(5, 10, 15, 20),
.splitrule = "gini",
.min.node.size = c(10, 20)
)
tunegrid
## .mtry .splitrule .min.node.size
## 1 5 gini 10
## 2 10 gini 10
## 3 15 gini 10
## 4 20 gini 10
## 5 5 gini 20
## 6 10 gini 20
## 7 15 gini 20
## 8 20 gini 20
cl <- makePSOCKcluster(7) #starts the cluster
registerDoParallel(cl)
rforest<-train(TARGET~., data=training, method="ranger", tuneGrid=tunegrid, metric="ROC",
num.trees=400, importance="impurity", trControl=ctrl )
stopCluster(cl)
rforest
## Random Forest
##
## 2700 samples
## 28 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ...
## Resampling results across tuning parameters:
##
## mtry min.node.size ROC Sens Spec
## 5 10 0.7807264 0.9826166 0.1171361
## 5 20 0.7845018 0.9835107 0.1015944
## 10 10 0.7797207 0.9723473 0.1455742
## 10 20 0.7790955 0.9772391 0.1464025
## 15 10 0.7791902 0.9683506 0.1771778
## 15 20 0.7787742 0.9692245 0.1717918
## 20 10 0.7758084 0.9648320 0.1835606
## 20 20 0.7758921 0.9647273 0.1770231
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 5, splitrule = gini
## and min.node.size = 20.
# Training-set probabilities (predict.train takes newdata, not data; the original
# "data=train" was silently ignored, so the training set was scored by default)
p.rforest.t <- predict(rforest, newdata = training, type = "prob")
rt <- roc(training$TARGET, p.rforest.t[, 2])
rt$auc
## Area under the curve: 0.9948
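The gap between the 0.9948 training AUC and the roughly 0.78 cross-validated ROC is the usual random-forest optimism on its own training data; the CV figure is the one to trust. Since the forest was fit with importance = "impurity", the stored Gini importances can be surfaced through caret. A quick sketch:

# Impurity-based variable importance, scaled 0-100 by caret
vi <- varImp(rforest)
plot(vi, top = 10)  # ten most influential predictors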
Grid <- expand.grid(n.trees = seq(50, 150, 50),
                    interaction.depth = c(10, 20),
                    shrinkage = c(0.1, 0.01),
                    n.minobsinnode = c(25))
Grid
## n.trees interaction.depth shrinkage n.minobsinnode
## 1 50 10 0.10 25
## 2 100 10 0.10 25
## 3 150 10 0.10 25
## 4 50 20 0.10 25
## 5 100 20 0.10 25
## 6 150 20 0.10 25
## 7 50 10 0.01 25
## 8 100 10 0.01 25
## 9 150 10 0.01 25
## 10 50 20 0.01 25
## 11 100 20 0.01 25
## 12 150 20 0.01 25
cl <- makePSOCKcluster(7) #starts the cluster
registerDoParallel(cl)
# gbm prints the iteration trace below while refitting the final model;
# pass verbose = FALSE through train() to suppress it
gb.tree <- train(TARGET~.,
                 data = training,
                 method = "gbm",
                 trControl = ctrl,
                 tuneGrid = Grid,
                 metric = "ROC")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.9076 nan 0.0100 0.0016
## 2 0.9034 nan 0.0100 0.0016
## 3 0.8992 nan 0.0100 0.0016
## 4 0.8958 nan 0.0100 0.0011
## 5 0.8918 nan 0.0100 0.0013
## 6 0.8881 nan 0.0100 0.0013
## 7 0.8843 nan 0.0100 0.0014
## 8 0.8805 nan 0.0100 0.0013
## 9 0.8771 nan 0.0100 0.0010
## 10 0.8734 nan 0.0100 0.0013
## 20 0.8424 nan 0.0100 0.0008
## 40 0.7937 nan 0.0100 0.0008
## 60 0.7562 nan 0.0100 0.0004
## 80 0.7255 nan 0.0100 0.0003
## 100 0.7000 nan 0.0100 0.0002
## 120 0.6775 nan 0.0100 0.0001
## 140 0.6581 nan 0.0100 0.0001
## 150 0.6491 nan 0.0100 0.0001
stopCluster(cl)
gb.tree
## Stochastic Gradient Boosting
##
## 2700 samples
## 28 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 10 50 0.7814997 1.0000000 0.00000000
## 0.01 10 100 0.7856315 0.9981836 0.02673905
## 0.01 10 150 0.7887540 0.9896939 0.07278545
## 0.01 20 50 0.7857880 1.0000000 0.00000000
## 0.01 20 100 0.7908018 0.9982167 0.02604960
## 0.01 20 150 0.7912795 0.9901522 0.08285713
## 0.10 10 50 0.7748154 0.9525918 0.25662565
## 0.10 10 100 0.7640528 0.9445376 0.26443601
## 0.10 10 150 0.7541765 0.9418990 0.25834650
## 0.10 20 50 0.7712585 0.9532218 0.22485030
## 0.10 20 100 0.7592403 0.9444463 0.24041818
## 0.10 20 150 0.7539450 0.9431058 0.25677088
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 25
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 20, shrinkage = 0.01 and n.minobsinnode = 25.
# Training-set probabilities (again newdata, not data)
p.gbtree <- predict(gb.tree, newdata = training, type = "prob")
r.gbt <- roc(training$TARGET, p.gbtree[, 2])
r.gbt$auc
## Area under the curve: 0.8957
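gbm also carries its own relative-influence measure, available from summary() on the final model. A minimal sketch, assuming the accompanying plot is not wanted:

# Relative influence of each predictor in the boosted ensemble
ri <- summary(gb.tree$finalModel, plotit = FALSE)
head(ri, 10)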
# Recover the predictor names kept by the stepwise search. caret dummy-codes
# factors, so the first term appears as "BanruptcyInd1"; strip the trailing
# level to get the original column name back.
terms <- names(lrstep$finalModel$coefficients)[-1]
terms[1] <- gsub(".{1}$", "", terms[1])
terms
## [1] "BanruptcyInd" "InqFinanceCnt24" "TLTimeFirst" "TLCnt03"
## [5] "TLCnt" "TLSatCnt" "TLDel60Cnt" "TL75UtilCnt"
## [9] "TLBalHCPct" "TLSatPct" "TLDel3060Cnt24" "TLDel60CntAll"
## [13] "TLBadDerogCnt" "TLOpen24Pct"
length(terms)
## [1] 14
# Decay values 0.1 and 0.5 (the printed grid below shows two decay settings per size)
tunegrid <- expand.grid(.size = 1:10, .decay = c(0.1, 0.5))
numWts <- 500  # upper bound on network weights, passed to nnet via MaxNWts
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
nnetFit<-train(x=training[,terms], y=training$TARGET,
method="nnet",
metric="ROC",
linout=FALSE,
preProcess = c("range"),
tuneGrid = tunegrid,
trace=FALSE,
maxit=100,
MaxNWts=numWts,
trControl=ctrl)
stopCluster(cl)
nnetFit
## Neural Network
##
## 2700 samples
## 14 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (13), ignore (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2430, 2430, 2430, 2429, 2431, 2431, ...
## Resampling results across tuning parameters:
##
## size decay ROC Sens Spec
## 1 0.1 0.7826621 0.9725920 0.1782356
## 1 0.5 0.7832237 0.9797481 0.1318634
## 2 0.1 0.7827776 0.9740201 0.1803045
## 2 0.5 0.7826682 0.9779356 0.1482085
## 3 0.1 0.7801947 0.9708733 0.1736824
## 3 0.5 0.7828413 0.9779356 0.1435650
## 4 0.1 0.7851653 0.9709098 0.1967197
## 4 0.5 0.7827301 0.9779356 0.1443070
## 5 0.1 0.7822174 0.9700344 0.1891950
## 5 0.5 0.7827711 0.9779356 0.1461251
## 6 0.1 0.7841436 0.9718167 0.1898472
## 6 0.5 0.7828995 0.9779356 0.1461251
## 7 0.1 0.7838348 0.9678145 0.1889150
## 7 0.5 0.7829185 0.9779356 0.1461251
## 8 0.1 0.7835543 0.9691534 0.1880951
## 8 0.5 0.7828257 0.9779356 0.1461251
## 9 0.1 0.7859040 0.9682703 0.1808231
## 9 0.5 0.7829179 0.9779356 0.1484507
## 10 0.1 0.7843533 0.9696346 0.1869514
## 10 0.5 0.7827680 0.9779356 0.1461251
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 9 and decay = 0.1.
# Training-set probabilities on the stepwise-selected columns (newdata, not data)
p.nnet <- predict(nnetFit, newdata = training[, terms], type = "prob")
r.nnt <- roc(training$TARGET, p.nnet[, 2])
r.nnt$auc
## Area under the curve: 0.8094
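Before tabulating, the three cross-validated models can be compared fold by fold, since they were all tuned on the same cvindx partitions. A sketch using caret's resamples (the stepwise fit is excluded because method = "none" leaves it nothing to resample):

# Fold-by-fold CV comparison across the shared 10 folds
rs <- resamples(list(RF = rforest, GBM = gb.tree, NNet = nnetFit))
summary(rs)                # ROC/Sens/Spec distributions per model
bwplot(rs, metric = "ROC")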
Model | ROC.Training | ROC.Validation | df
---|---|---|---
Linear Step | 0.7922924 | 0.8212638 | 14
Random Forest | 0.9948397 | 0.7655146 |
GB Tree | 0.8957447 | 0.7837838 |
NN_log | 0.8093949 | 0.8221113 |
# Validation-set probabilities for the random forest; its ROC curve supplies the cutoff
# (as in the original, the cutoff is derived from the forest but applied below to the NN scores)
p.rforest <- predict(rforest, newdata = valid, type = "prob")
r <- roc(valid$TARGET, p.rforest[, 2])
coords(r, "best", ret = "threshold")
## threshold
## 1 0.2710543
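By default coords(r, "best") maximizes Youden's J (sensitivity + specificity - 1). Making that explicit, and pulling the operating characteristics at the chosen cutoff:

# Same cutoff, with the trade-off it implies made visible
coords(r, "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))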
# Assumed, not shown in the original: a validation scoring frame holding the true
# targets (positive level first, to match the matrix below) and the NN probabilities
eval <- data.frame(Target = relevel(valid$TARGET, "Yes"),
                   NN = predict(nnetFit, newdata = valid[, terms], type = "prob")[, 2])
# Classify at the ROC-derived cutoff; the factor levels must match the reference
eval$Cut <- factor(ifelse(eval$NN > 0.2710543, "Yes", "No"), levels = levels(eval$Target))
confusionMatrix(data = eval$Cut, reference = eval$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 22 33
## No 19 226
##
## Accuracy : 0.8267
## 95% CI : (0.779, 0.8678)
## No Information Rate : 0.8633
## P-Value [Acc > NIR] : 0.97024
##
## Kappa : 0.3578
##
## Mcnemar's Test P-Value : 0.07142
##
## Sensitivity : 0.53659
## Specificity : 0.87259
## Pos Pred Value : 0.40000
## Neg Pred Value : 0.92245
## Prevalence : 0.13667
## Detection Rate : 0.07333
## Detection Prevalence : 0.18333
## Balanced Accuracy : 0.70459
##
## 'Positive' Class : Yes
##
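With only 41 defaults among the 300 validation cases, the figures above carry real sampling noise. pROC can attach a DeLong 95% confidence interval to the validation AUC; a short sketch:

# 95% DeLong CI for the validation-set AUC of the forest's scores
ci.auc(r)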