Exam 1 Output Packet

The goal of this analysis is to predict whether a person’s income was large (defined as more than $50k). We will consider the target variable as income where “1=large” and “0=small”. The predictors include: age, workclass, education, education-num (number of years someone was in school), marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country. The data was obtained from census data. We would are interested in specifically identifying people with a large income.

pacman::p_load(DataExplorer, tidyverse, caret, skimr, ROCR, pROC, fastDummies)

head(df)

##   age        workclass education-num     marital-status        occupation
## 1  39        State-gov            13      Never-married      Adm-clerical
## 2  50 Self-emp-not-inc            13 Married-civ-spouse   Exec-managerial
## 3  38          Private             9           Divorced Handlers-cleaners
## 4  53          Private             7 Married-civ-spouse Handlers-cleaners
## 5  28          Private            13 Married-civ-spouse    Prof-specialty
## 6  37          Private            14 Married-civ-spouse   Exec-managerial
##    relationship  race    sex capital-gain capital-loss hours-per-week
## 1 Not-in-family White   Male         2174            0             40
## 2       Husband White   Male            0            0             13
## 3 Not-in-family White   Male            0            0             40
## 4       Husband Black   Male            0            0             40
## 5          Wife Black Female            0            0             40
## 6          Wife White Female            0            0             40
##   native-country income
## 1  United-States  small
## 2  United-States  small
## 3  United-States  small
## 4  United-States  small
## 5           Cuba  small
## 6  United-States  small

Part 1: Perpare Data for Modeling

Data Summary

Here is a summary of the target variable.

prop.table(table(df$income))

## 
##      small      large 
## 0.96111975 0.03888025

table(df$income)

## 
## small large 
## 24720  1000

Here is a summary of the data:

Data summary
Name	df
Number of rows	25720
Number of columns	13
_______________________
Column type frequency:
factor	8
numeric	5
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
workclass	1669	0.94	FALSE	8	Pri: 18376, Sel: 1902, Loc: 1535, Sta: 996
marital-status	0	1.00	FALSE	7	Nev: 10251, Mar: 9151, Div: 4028, Sep: 969
occupation	1676	0.93	FALSE	14	Adm: 3324, Cra: 3290, Oth: 3180, Sal: 2810
relationship	0	1.00	FALSE	6	Hus: 8027, Not: 7548, Own: 5007, Unm: 3257
race	0	1.00	FALSE	5	Whi: 21602, Bla: 2790, Asi: 804, Ame: 277
sex	0	1.00	FALSE	2	Mal: 15966, Fem: 9754
native-country	461	0.98	FALSE	41	Uni: 22901, Mex: 616, Phi: 143, Pue: 102
income	0	1.00	TRUE	2	sma: 24720, lar: 1000

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
age	1	37.07	13.96	17	26	35	46	90
education-num	1	9.67	2.46	1	9	9	11	16
capital-gain	1	281.43	2885.24	0	0	0	0	99999
capital-loss	1	59.06	328.54	0	0	0	0	4356
hours-per-week	1	39.10	12.33	1	36	40	40	99

plot_missing(df)

plot_histogram(df)

Categorical Data Preperation

Here is a summary of the native-country variable.

plot_bar(df$`native-country`)

Dummy variables were created.

df<-dummy_columns(df, select_columns = c("workclass", "marital-status", "occupation", "relationship", "race", "sex"), remove_first_dummy = TRUE, remove_selected_columns = TRUE)

Summary of marital-status_Married-AF-spouse.

summary(as.factor(df$`marital-status_Married-AF-spouse`))

##     0     1 
## 25707    13

Partitioning and Over-Sampling

Here are the proportions for the validation data set.

set.seed(13)
trainIndex<-caret::createDataPartition(df$income, p=0.8, list=FALSE, times=1)
head(trainIndex)

##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         4
## [5,]         5
## [6,]         6

valid<-df[-trainIndex,]

prop.table(table(valid$income))

## 
##      small      large 
## 0.95974576 0.04025424

dim(valid)

## [1] 4720   40

Here are the proportions for the training data set.

prop.table(table(training$income))

## 
## small large 
##   0.5   0.5

Part 2: Modeling

I constructed two models.

Compare the models

Model 1

r<-pROC::roc(valid$income, preds, levels=c("small", "large"), direction="<")
r$auc

## Area under the curve: 0.9016

Model 2

r1<-roc(valid$income, preds1, levels=c("small", "large"), direction="<")
r1$auc

## Area under the curve: 0.8024

roc<-list(r, r1)
p<-ggroc(roc)+theme_bw()+scale_colour_discrete("Model")
p

This is for model 1.

threshold<-coords(r,"best", ret="threshold")
threshold

##   threshold
## 1  0.527188

Here is a summary of the income variable in the validation data.

summary(valid$income)

## small large 
##  4530   190

valid$preds<-preds
valid$cut1<-ifelse(valid$preds>threshold$threshold, 'large', 'small')
valid$cut1<-factor(valid$cut1, levels=c("small", "large")) #making sure the levels match up so we are predicting large
caret::confusionMatrix(valid$cut1, valid$income, positive="large")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction small large
##      small  3634    31
##      large   896   159
##                                          
##                Accuracy : 0.8036         
##                  95% CI : (0.792, 0.8149)
##     No Information Rate : 0.9597         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2009         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.83684        
##             Specificity : 0.80221        
##          Pos Pred Value : 0.15071        
##          Neg Pred Value : 0.99154        
##              Prevalence : 0.04025        
##          Detection Rate : 0.03369        
##    Detection Prevalence : 0.22352        
##       Balanced Accuracy : 0.81952        
##                                          
##        'Positive' Class : large          
##

Below is a small summary from a lift analysis for model 1 on the validation data.

##       Cutoffs True_Positive Counts
## 96  0.9430429            53     95
## 97  0.9422935            53     96
## 98  0.9417224            53     97
## 99  0.9412763            54     98
## 100 0.9410510            54     99

Part 3: Principal Components Analysis

Let’s pretend we are going to construct a model to predict Class, using only the first 4 columns in the Sonar data (because this is an exam, not real life). And because it is an exam, I conducted principal components analysis.

library(mlbench)
data("Sonar")
Sonar<-Sonar[,1:4]
skim_without_charts(Sonar)

Data summary
Name	Sonar
Number of rows	208
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
V1	1	0.03	0.02	0.00	0.01	0.02	0.04	0.14
V2	1	0.04	0.03	0.00	0.02	0.03	0.05	0.23
V3	1	0.04	0.04	0.00	0.02	0.03	0.06	0.31
V4	1	0.05	0.05	0.01	0.02	0.04	0.06	0.43

pca<-prcomp(Sonar,scale=T)
pca$rotation

##          PC1        PC2        PC3        PC4
## V1 0.4638718  0.6853104  0.5056565 -0.2438937
## V2 0.5244959  0.2658007 -0.5597689  0.5838774
## V3 0.5273907 -0.3233094 -0.3778910 -0.6888603
## V4 0.4812340 -0.5959620  0.5368134  0.3536587

head(scale(Sonar))

##            V1         V2          V3         V4
## 1 -0.39858974 -0.0405504 -0.02686084 -0.7133841
## 2  0.70184499  0.4206156  1.05307772  0.3225521
## 3 -0.12891799  0.5996209  1.71925669  1.1693547
## 4 -0.83354417 -0.6473478  0.48058017 -0.7176825
## 5  2.04585419  0.8544758  0.11105902 -0.3114752
## 6 -0.02452892  0.2082365 -0.41980235 -0.7843091

Exam 1 Output Packet

Part 1: Perpare Data for Modeling

Data Summary

Categorical Data Preperation

Partitioning and Over-Sampling

Part 2: Modeling

Compare the models

Part 3: Principal Components Analysis

Part 4: Screen Captures

File Explorer

Enviroment

Code Chunks

Table 1

Table 2

Table 3