Introduction

Lending Club is a US peer-to-peer lending company and, as described by Wikipedia, the world’s largest peer-to-peer lending platform.

The goal of this project is to use data from Lending Club (2019Q3 to 2020Q2) to conduct a set of exploratory analyses and apply multiple machine learning techniques to predict the customer default rate.

Data files contain complete loan data for all loans issued through the time period stated, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information.
Based on this goal, we will focus only on records whose loan status is Fully Paid, Charged Off, Default, or Late. We will remove records that are not in these statuses, and group Charged Off, Default, and Late into a single category called Default.

Reading in the data

# Read the four quarterly LendingClub extracts and stack them into one frame.
# NOTE(review): setwd() with an absolute user-specific path is fragile for
# anyone else running this notebook -- project-relative paths would be safer;
# kept as-is so the knitted output still resolves.
setwd("/Users/huitingsheng/Downloads/Github/Lending_Club_EDA_and_Modeling")
data1 <- read.csv("LoanStats_securev1_2019Q3.csv")
data2 <- read.csv("LoanStats_securev1_2019Q4.csv")
data3 <- read.csv("LoanStats_securev1_2020Q1.csv")
data4 <- read.csv("LoanStats_securev1_2020Q2.csv")
# rbind() requires all four extracts to share an identical column set.
data <- rbind(data1, data2, data3, data4)
dim(data)
## [1] 389627    150

Installing/Loading Packages

packages <- c("caret", "ggplot2", "tidyverse", "dplyr", "corrplot", "e1071",
              "reshape2", "lubridate", "usmap", "glmnet", "pROC", "doParallel",
              "ranger", "lattice", "gridExtra", "kableExtra", "ROSE", "DMwR")

# Install any missing package, then attach them all.
# requireNamespace() checks availability without attaching (avoiding the
# require()-for-loading anti-pattern); library() then fails loudly if an
# install did not succeed instead of silently returning FALSE.
# NOTE(review): DMwR has been archived on CRAN; install.packages() may fail
# for it -- confirm an alternate source (e.g. an archived tarball) if needed.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

Remove all records that loan Status is Current and Issued

Our goal is to predict the default rate of customers, so before we move forward to clean up the data, let’s combine the loan statuses. We only want records that are Fully Paid, Default, Charged Off, or Late. Default, Charged Off, and Late are considered risky customers that we want to identify.
As we don’t know whether currently active or newly issued loans will be fully paid or become charged off, we remove all Current and Issued records, and combine Default, Charged Off, and Late into one group called Charged_off.

# Work on a copy so the raw `data` frame stays untouched.
df = data
# Tabulate the raw loan_status values; the first row shows 16 records with a
# blank ("") status.
as.data.frame(table(df$loan_status))
##                 Var1   Freq
## 1                        16
## 2        Charged Off   6387
## 3            Current 333956
## 4            Default     67
## 5         Fully Paid  40207
## 6    In Grace Period   4070
## 7  Late (16-30 days)   1384
## 8 Late (31-120 days)   3531
## 9             Issued      9
# Visualize the class distribution of loan_status.
ggplot(data=df,aes(y=loan_status)) + geom_bar()

# Keep only resolved loans: drop Current, Issued, and blank statuses, then
# collapse every remaining status into a binary Fully_Paid / Charged_off
# outcome.
# NOTE: statuses load as "" rather than NA (see the table above), so the
# %in% test filters exactly the same rows as the original chain of !=
# comparisons. The credit-policy label below is reproduced byte-for-byte,
# including its leading/trailing spaces.
excluded_status <- c("Current", "Issued", "")
df = df %>%
  filter(!loan_status %in% excluded_status) %>%
  mutate(loan_status = if_else(loan_status %in%
                        c("Fully Paid"," Does not meet the credit policy. Status:Fully Paid "),
                        "Fully_Paid","Charged_off"))

# Two-class outcome counts (imbalanced: roughly 28% Charged_off).
as.data.frame(table(df$loan_status))
##          Var1  Freq
## 1 Charged_off 15439
## 2  Fully_Paid 40207

Charged_off and Fully_Paid are imbalanced. We will use down-sampling while building the models.

Combining Application type

There are two application types, individual and joint, but only about 10% of applications are joint. I want to transform the related features before formal data cleansing, in case all joint-application information gets dropped.

# For joint applications, replace the individual-level income, DTI, and
# verification status with their joint counterparts, then drop the now
# redundant *_joint columns so every row uses one consistent set of fields.
df = df %>% 
  mutate(annual_inc = ifelse(application_type == "Joint App",annual_inc_joint,annual_inc), 
         dti = ifelse(application_type == "Joint App", dti_joint,dti),
         verification_status = ifelse(application_type == "Joint App",
                                      verification_status_joint,verification_status)) %>%
  select(-c(annual_inc_joint,dti_joint,verification_status_joint))

drop columns that have no variance or low variance

No-variance and low-variance features don’t bring any useful information to our model, so we are going to drop them.

# caret::nearZeroVar() flags columns with (near-)zero variance; such columns
# carry no predictive signal, so record their names and drop them.
nzv = colnames(df)[nearZeroVar(df, allowParallel = TRUE)]
cat("Zero or low Variance Columns are: \n", nzv)
## Zero or low Variance Columns are: 
##  member_id pymnt_plan desc total_rec_late_fee recoveries collection_recovery_fee collections_12_mths_ex_med policy_code acc_now_delinq tot_coll_amt chargeoff_within_12_mths delinq_amnt num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m pct_tl_nvr_dlq tax_liens sec_app_earliest_cr_line sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med hardship_amount hardship_end_date payment_plan_start_date hardship_dpd debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date
data_wo_nzv = df %>% select(-all_of(nzv)) # drop no and low variance features

We have dropped 28 features.

Drop features missing more than 50% data

If a feature is missing more than 50% of its values, we want to drop it, as it does not bring extra information to our model.

# Per-column missing-data audit.
# Vectorized with colSums(is.na()) instead of growing `per` and `fraction`
# inside a loop (the original append/rbind-in-loop pattern copies the
# accumulator on every iteration).
na_count <- colSums(is.na(data_wo_nzv))
# `per` keeps its original form: an unnamed numeric vector of NA fractions,
# reused later to select columns with > 50% missing.
per <- unname(na_count / nrow(data_wo_nzv))

# Same shape as the original rbind-built frame: three all-character columns
# named name / count / missing_fraction.
fraction <- data.frame(
  name = colnames(data_wo_nzv),
  count = as.character(na_count),
  missing_fraction = as.character(per),
  stringsAsFactors = FALSE
)
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name count missing_fraction
id 0 0
loan_amnt 0 0
funded_amnt 0 0
funded_amnt_inv 0 0
term 0 0
int_rate 0 0
installment 0 0
grade 0 0
sub_grade 0 0
emp_title 1 1.79707436293714e-05
emp_length 0 0
home_ownership 0 0
annual_inc 0 0
verification_status 0 0
issue_d 0 0
loan_status 0 0
url 0 0
purpose 0 0
title 0 0
zip_code 0 0
addr_state 0 0
dti 0 0
delinq_2yrs 0 0
earliest_cr_line 0 0
fico_range_low 0 0
fico_range_high 0 0
inq_last_6mths 0 0
mths_since_last_delinq 31076 0.558458829026345
mths_since_last_record 49227 0.884645796643065
open_acc 0 0
pub_rec 0 0
revol_bal 0 0
revol_util 0 0
total_acc 0 0
initial_list_status 0 0
out_prncp 0 0
out_prncp_inv 0 0
total_pymnt 0 0
total_pymnt_inv 0 0
total_rec_prncp 0 0
total_rec_int 0 0
last_pymnt_d 0 0
last_pymnt_amnt 0 0
next_pymnt_d 0 0
last_credit_pull_d 0 0
last_fico_range_high 0 0
last_fico_range_low 0 0
mths_since_last_major_derog 43194 0.776228300327068
application_type 0 0
tot_cur_bal 0 0
open_acc_6m 0 0
open_act_il 0 0
open_il_12m 0 0
open_il_24m 0 0
mths_since_rcnt_il 1279 0.022984581101966
total_bal_il 0 0
il_util 7733 0.138967760485929
open_rv_12m 0 0
open_rv_24m 0 0
max_bal_bc 0 0
all_util 20 0.000359414872587428
total_rev_hi_lim 0 0
inq_fi 0 0
total_cu_tl 0 0
inq_last_12m 0 0
acc_open_past_24mths 0 0
avg_cur_bal 10 0.000179707436293714
bc_open_to_buy 872 0.0156704884448118
bc_util 908 0.0163174352154692
mo_sin_old_il_acct 1279 0.022984581101966
mo_sin_old_rev_tl_op 0 0
mo_sin_rcnt_rev_tl_op 0 0
mo_sin_rcnt_tl 0 0
mort_acc 0 0
mths_since_recent_bc 814 0.0146281853143083
mths_since_recent_bc_dlq 44081 0.79216834992632
mths_since_recent_inq 4438 0.0797541602271502
mths_since_recent_revol_delinq 39139 0.703356934909967
num_accts_ever_120_pd 0 0
num_actv_bc_tl 0 0
num_actv_rev_tl 0 0
num_bc_sats 0 0
num_bc_tl 0 0
num_il_tl 0 0
num_op_rev_tl 0 0
num_rev_accts 0 0
num_rev_tl_bal_gt_0 0 0
num_sats 0 0
num_tl_op_past_12m 0 0
percent_bc_gt_75 874 0.0157064299320706
pub_rec_bankruptcies 0 0
tot_hi_cred_lim 0 0
total_bal_ex_mort 0 0
total_bc_limit 0 0
total_il_high_credit_limit 0 0
revol_bal_joint 48503 0.8716349782554
sec_app_fico_range_low 48503 0.8716349782554
sec_app_fico_range_high 48503 0.8716349782554
sec_app_inq_last_6mths 48503 0.8716349782554
sec_app_mort_acc 48503 0.8716349782554
sec_app_open_acc 48503 0.8716349782554
sec_app_revol_util 48646 0.8742047945944
sec_app_open_act_il 48503 0.8716349782554
sec_app_num_rev_accts 48503 0.8716349782554
sec_app_mths_since_last_major_derog 53403 0.95969162203932
hardship_flag 0 0
hardship_type 0 0
hardship_reason 0 0
hardship_status 0 0
deferral_term 49429 0.888275886856198
hardship_start_date 0 0
hardship_length 49429 0.888275886856198
hardship_loan_status 0 0
orig_projected_additional_accrued_interest 42743 0.768123494950221
hardship_payoff_balance_amount 42743 0.768123494950221
hardship_last_payment_amount 42743 0.768123494950221
settlement_amount 55576 0.998742047945944
settlement_percentage 55576 0.998742047945944
settlement_term 55576 0.998742047945944
# Histogram of Feature Incompleteness (distribution of per-column NA rates).
hist(per, main="Histogram of Feature Incompleteness", xlab="missing fraction")

# Drop columns missing more than 50% of their records.
# Name the column set once and select with all_of(): negating a bare
# character vector inside select() is deprecated tidyselect behavior.
high_na_cols <- colnames(data_wo_nzv)[per > 0.5]
data_drop = data_wo_nzv %>% select(-all_of(high_na_cols))
cat("drop columns are:\n", high_na_cols)
## drop columns are:
##  mths_since_last_delinq mths_since_last_record mths_since_last_major_derog mths_since_recent_bc_dlq mths_since_recent_revol_delinq revol_bal_joint sec_app_fico_range_low sec_app_fico_range_high sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_mths_since_last_major_derog deferral_term hardship_length orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount settlement_amount settlement_percentage settlement_term

We have dropped another 23 features.

drop columns that don’t need

We want to remove noise features such as url, desc, etc. Also, we want to focus on the information that customers provide to Lending Club when they apply for the loan, so we will ignore all features generated after the loan is approved.

# Inspect the first 10 rows of the current data before further feature selection.
data_drop[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status url purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low application_type tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_inq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_op_past_12m percent_bc_gt_75 pub_rec_bankruptcies tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit hardship_flag hardship_type hardship_reason hardship_status hardship_start_date hardship_loan_status
158332903 8000 8000 8000 36 months 10.33% 259.38 B B1 Processing Manager 6 years MORTGAGE 116600 3 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=158332903 debt_consolidation Debt consolidation 982xx WA 36.48 0 Mar-05 700 704 1 19 0 67350 55% 34 w 0.00 0.00 8144.882 8144.88 8000.00 144.88 Dec-19 7890.09 Aug-20 704 700 Individual 137633 1 4 3 4 2 70283 74 0 1 19425 64 122400 3 6 3 5 7244 31138 47.5 158 174 13 2 0 13 2 0 4 8 7 10 11 15 23 8 19 3 42.9 0 216889 137633 59300 94489 N
159383484 25000 25000 25000 36 months 28.80% 1044.93 D D5 traffic cordinator 10+ years MORTGAGE 80000 3 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159383484 debt_consolidation Debt consolidation 351xx AL 18.15 0 Sep-08 695 699 0 6 0 43105 85.90% 30 w 0.00 0.00 25320.000 25320.00 25000.00 320.00 Oct-19 25360.00 Jan-20 744 740 Individual 131078 1 1 0 1 21 26318 76 1 2 2988 82 50200 1 12 0 3 21846 7212 29.3 120 132 3 3 3 17 21 0 1 3 2 5 17 4 10 3 6 1 50.0 0 157774 69423 10200 34574 N
159523765 15950 15950 15950 36 months 28.80% 666.67 D D5 < 1 year RENT 125000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=159523765 debt_consolidation Debt consolidation 331xx FL 20.75 0 May-06 680 684 0 11 0 32528 47.30% 45 w 0.00 0.00 1891.150 1891.15 0.00 0.00 0.00 Apr-20 544 540 Individual 50854 1 1 0 2 13 18326 64 5 5 14173 52 68800 2 1 4 8 4623 29864 51.7 91 160 4 4 1 4 7 0 6 7 9 27 9 10 33 7 11 6 33.3 0 97398 50854 61800 28598 N
159568503 9900 9900 9900 36 months 8.19% 311.10 A A4 Teacher 9 years MORTGAGE 333000 1 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159568503 credit_card Credit card refinancing 212xx MD 6.38 0 Sep-04 715 719 0 11 0 18633 54% 30 w 0.00 0.00 10203.490 10203.49 9900.00 303.49 Feb-20 8963.59 Mar-20 674 670 Joint App 503648 1 3 1 2 11 84319 74 2 3 5847 57 34500 0 3 0 5 45786 8967 65.8 168 180 5 5 1 5 16 4 5 6 5 6 16 7 13 6 11 3 60.0 0 539572 102952 26200 87770 N
159216296 16000 16000 16000 60 months 8.19% 325.88 A A4 Business Agent 2 years MORTGAGE 116000 2 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159216296 credit_card Credit card refinancing 554xx MN 15.69 0 Feb-04 720 724 0 25 0 44990 20.70% 68 w 0.00 0.00 16417.115 16417.12 16000.00 417.12 Dec-19 15772.64 Jan-20 764 760 Individual 53953 1 2 0 0 31 8963 30 2 5 10900 22 217000 0 4 2 6 2158 171110 19.3 140 187 9 2 4 9 3 0 13 14 22 32 20 23 44 14 25 3 0.0 0 246763 53953 212000 29763 N
159602788 15950 15950 15950 36 months 20.55% 597.24 D D2 Project Manager 8 years MORTGAGE 105000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=159602788 credit_card Credit card refinancing 776xx TX 27.61 0 Sep-06 710 714 0 10 0 11978 22.80% 38 w 0.00 0.00 2967.990 2967.99 1685.29 1282.70 Feb-20 597.24 Aug-20 554 550 Individual 288166 0 3 0 3 18 114203 NA 1 3 9489 23 52500 0 0 5 6 28817 40522 22.8 156 78 9 9 2 9 1 0 5 5 6 7 25 6 11 5 10 1 33.3 0 362222 126181 52500 138559 N
159026517 4200 4200 4200 36 months 20.55% 157.27 D D2 Rural Carrier associate 3 years MORTGAGE 44500 4 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159026517 debt_consolidation Debt consolidation 760xx TX 35.30 0 Jun-03 670 674 1 16 0 15665 56.10% 22 w 0.00 0.00 4670.050 4670.05 4200.00 470.05 May-20 3731.23 May-20 674 670 Individual 133206 2 4 2 3 10 17370 58 3 4 4555 57 27900 2 0 4 7 8325 3483 79.6 82 81 3 3 1 3 2 0 6 10 6 8 6 11 15 10 16 5 66.7 0 197254 33035 17100 29979
158884774 27200 27200 27200 60 months 15.24% 650.52 C C2 Dentist 1 year RENT 150000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=158884774 credit_card Credit card refinancing 029xx RI 23.66 0 Feb-05 695 699 0 12 0 31561 51.70% 27 w 0.00 0.00 2611.580 2611.58 1238.32 1340.73 Mar-20 0.00 Aug-20 564 560 Individual 863114 3 7 2 2 3 831553 92 1 1 29624 68 61000 0 0 0 3 86311 29439 51.7 175 159 4 3 0 4 NA 0 2 2 3 4 23 3 4 2 12 3 66.7 0 724574 863114 61000 663574 N
159598770 5250 5250 5250 36 months 7.56% 163.46 A A3 Product Manager 6 years RENT 96000 2 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159598770 debt_consolidation Debt consolidation 275xx NC 2.09 0 Apr-10 720 724 0 6 0 3012 5.70% 16 w 0.00 0.00 5376.050 5376.05 5250.00 126.05 Feb-20 3887.87 Aug-20 724 720 Individual 139796 0 0 1 1 8 0 NA 3 3 1461 6 52500 1 0 2 4 23299 36039 3.9 56 113 11 8 1 12 8 0 1 2 2 6 2 5 13 2 6 4 0.0 0 214511 3012 37500 0 N
158744038 26000 26000 26000 36 months 15.24% 904.36 C C2 Security guard < 1 year RENT 89000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=158744038 debt_consolidation Debt consolidation 114xx NY 7.42 0 Sep-08 750 754 1 7 0 0 0% 15 w 22443.41 22443.41 5404.150 5404.15 3556.59 1847.56 May-20 0.00 Sep-20 Aug-20 639 635 Joint App 22251 1 2 2 3 5 22251 92 1 2 0 51 19200 2 0 2 5 3179 18000 0.0 16 132 11 5 0 24 5 0 0 0 4 8 3 5 12 0 7 3 0.0 0 43300 22251 18000 24100 N CVD19SKIP INCOMECURT COMPLETE Apr-20 ACTIVE
# Keep only application-time features (information available when the loan is
# requested) plus the loan_status label; everything generated after approval
# (payment history, recovery amounts, hardship fields, ...) is discarded.
columns_keep = c('loan_amnt','term','sub_grade',"int_rate",'emp_length','home_ownership','annual_inc','verification_status','purpose','addr_state','dti','delinq_2yrs','earliest_cr_line','fico_range_low','fico_range_high', "inq_last_6mths", 'open_acc','pub_rec','total_acc',"initial_list_status","application_type","all_util","bc_util",'revol_util','mort_acc', 'num_accts_ever_120_pd', "percent_bc_gt_75", 'pub_rec_bankruptcies',"num_tl_op_past_12m","num_actv_rev_tl",'loan_status')  

# LC holds the modeling frame; col_drops records what was removed for the log.
LC = data_drop %>%select(all_of(columns_keep))
col_drops = data_drop %>%select(-all_of(columns_keep))%>%colnames
cat("columns that droped: \n", col_drops)
## columns that droped: 
##  id funded_amnt funded_amnt_inv installment grade emp_title issue_d url title zip_code revol_bal out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mths_since_recent_bc mths_since_recent_inq num_actv_bc_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit hardship_flag hardship_type hardship_reason hardship_status hardship_start_date hardship_loan_status
# remaining features
LC[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
loan_amnt term sub_grade int_rate emp_length home_ownership annual_inc verification_status purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths open_acc pub_rec total_acc initial_list_status application_type all_util bc_util revol_util mort_acc num_accts_ever_120_pd percent_bc_gt_75 pub_rec_bankruptcies num_tl_op_past_12m num_actv_rev_tl loan_status
8000 36 months B1 10.33% 6 years MORTGAGE 116600 3 debt_consolidation WA 36.48 0 Mar-05 700 704 1 19 0 34 w Individual 64 47.5 55% 0 0 42.9 0 3 8 Fully_Paid
25000 36 months D5 28.80% 10+ years MORTGAGE 80000 3 debt_consolidation AL 18.15 0 Sep-08 695 699 0 6 0 30 w Individual 82 29.3 85.90% 3 0 50.0 0 1 3 Fully_Paid
15950 36 months D5 28.80% < 1 year RENT 125000 3 debt_consolidation FL 20.75 0 May-06 680 684 0 11 0 45 w Individual 52 51.7 47.30% 1 0 33.3 0 6 7 Charged_off
9900 36 months A4 8.19% 9 years MORTGAGE 333000 1 credit_card MD 6.38 0 Sep-04 715 719 0 11 0 30 w Joint App 57 65.8 54% 1 4 60.0 0 3 6 Fully_Paid
16000 60 months A4 8.19% 2 years MORTGAGE 116000 2 credit_card MN 15.69 0 Feb-04 720 724 0 25 0 68 w Individual 22 19.3 20.70% 4 0 0.0 0 3 14 Fully_Paid
15950 36 months D2 20.55% 8 years MORTGAGE 105000 3 credit_card TX 27.61 0 Sep-06 710 714 0 10 0 38 w Individual 23 22.8 22.80% 2 0 33.3 0 1 5 Charged_off
4200 36 months D2 20.55% 3 years MORTGAGE 44500 4 debt_consolidation TX 35.30 0 Jun-03 670 674 1 16 0 22 w Individual 57 79.6 56.10% 1 0 66.7 0 5 10 Fully_Paid
27200 60 months C2 15.24% 1 year RENT 150000 3 credit_card RI 23.66 0 Feb-05 695 699 0 12 0 27 w Individual 68 51.7 51.70% 0 0 66.7 0 3 2 Charged_off
5250 36 months A3 7.56% 6 years RENT 96000 2 debt_consolidation NC 2.09 0 Apr-10 720 724 0 6 0 16 w Individual 6 3.9 5.70% 1 0 0.0 0 4 2 Fully_Paid
26000 36 months C2 15.24% < 1 year RENT 89000 3 debt_consolidation NY 7.42 0 Sep-08 750 754 1 7 0 15 w Joint App 51 0.0 0% 0 0 0.0 0 3 0 Charged_off

cleaning data with NA

# Re-audit missing data on the reduced feature set.
# Vectorized with colSums(is.na()) instead of growing `fraction` with rbind
# inside a loop; produces the same three all-character columns as before.
na_count_lc <- colSums(is.na(LC))
fraction <- data.frame(
  name = colnames(LC),
  count = as.character(na_count_lc),
  missing_fraction = as.character(unname(na_count_lc / nrow(LC))),
  stringsAsFactors = FALSE
)
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name count missing_fraction
loan_amnt 0 0
term 0 0
sub_grade 0 0
int_rate 0 0
emp_length 0 0
home_ownership 0 0
annual_inc 0 0
verification_status 0 0
purpose 0 0
addr_state 0 0
dti 0 0
delinq_2yrs 0 0
earliest_cr_line 0 0
fico_range_low 0 0
fico_range_high 0 0
inq_last_6mths 0 0
open_acc 0 0
pub_rec 0 0
total_acc 0 0
initial_list_status 0 0
application_type 0 0
all_util 20 0.000359414872587428
bc_util 908 0.0163174352154692
revol_util 0 0
mort_acc 0 0
num_accts_ever_120_pd 0 0
percent_bc_gt_75 874 0.0157064299320706
pub_rec_bankruptcies 0 0
num_tl_op_past_12m 0 0
num_actv_rev_tl 0 0
loan_status 0 0
# With ~55.6k rows and at most ~1.6% missing in any remaining column,
# dropping incomplete rows is cheap, so we can just drop all records with
# missing values rather than impute.
LC = LC%>%drop_na
# Class balance after row removal (still roughly 28% Charged_off).
as.data.frame(table(LC$loan_status))
##          Var1  Freq
## 1 Charged_off 15178
## 2  Fully_Paid 39558
##          Var1  Freq
## 1 Charged_off 15178
## 2  Fully_Paid 39558

data transformation

earliest_cr_line is the month the borrower’s earliest reported credit line was opened. We want to convert it to a numeric value showing how many years the borrower has had a credit line since the first one was opened.

# Convert earliest_cr_line (e.g. "Sep-04") into credit-history length in years.
#
# convert_year(): two-digit-year century fix. strptime's %y pivots at 68/69,
# so years like "68" parse as 2068. This remaps any two-digit year greater
# than `year %% 100` (default 40) to the 1900s and the rest to the 2000s,
# overriding the parser's pivot.
convert_year <- function(x, year=1940){
  m <- year(x) %% 100
  year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m)
  x
}

# Pipeline: prefix a day ("01-") so the month string parses as a full date,
# fix the century, subtract from today, and round to whole years.
# NOTE(review): format="%d-%B-%y" uses %B (full month name) against
# abbreviated names like "Sep"; glibc accepts both, but this is
# locale/platform sensitive -- %b would be the exact specifier. Confirm on
# the target platform.
# NOTE(review): Sys.Date() makes cr_year depend on when the notebook is knit.
LC = LC%>%mutate(cr_year = earliest_cr_line %>% 
                                         paste0("01-",.) %>%
                                         as.Date(format="%d-%B-%y") %>% 
                                         convert_year () %>% 
                                         difftime(Sys.Date(), .) %>% 
                                         time_length("years") %>%
                                         round(0)) %>% select (-earliest_cr_line)

take the average of fico score high and low.

# Collapse the reported FICO range to its midpoint, then drop the raw bounds.
LC = LC %>%
  mutate(fico_score = (fico_range_low + fico_range_high) / 2) %>%
  select(-c(fico_range_low, fico_range_high))

correct the data type

# Audit the class of every column (computed on a single row for speed).
table(sapply(LC[1,], class))
## 
## character    factor   integer   numeric 
##         1        10        13         6
cat("Character columns: ", LC %>% select_if(is.character) %>% colnames, "\n")
## Character columns:  loan_status
# Convert loan_status (the response) to a factor for modeling.
LC$loan_status = as.factor(LC$loan_status)
cat("Factor columns: ", LC %>% select_if(is.factor )%>% colnames, "\n")
## Factor columns:  term sub_grade int_rate emp_length home_ownership purpose addr_state initial_list_status application_type revol_util loan_status
# int_rate loads as a "x.xx%" string; strip the "%" and rescale to a decimal.
LC$int_rate = as.numeric(sub("%","",LC$int_rate))/100

# Same treatment for revol_util.
LC$revol_util = as.numeric(sub("%","",LC$revol_util))/100

# Strip "years"/"year" from emp_length. NOTE: this leaves trailing spaces in
# levels such as "10+ " and "< 1 "; the later numeric recoding matches those
# strings exactly, so they must not be trimmed here without updating it.
LC$emp_length = sub("years","",LC$emp_length)
LC$emp_length = as.factor(sub("year","",LC$emp_length))

cat("Numeric columns: ", LC %>% select_if(is.numeric) %>% colnames,"\n")
## Numeric columns:  loan_amnt int_rate annual_inc verification_status dti delinq_2yrs inq_last_6mths open_acc pub_rec total_acc all_util bc_util revol_util mort_acc num_accts_ever_120_pd percent_bc_gt_75 pub_rec_bankruptcies num_tl_op_past_12m num_actv_rev_tl cr_year fico_score
# verification_status is an encoded category (values 1-4 above), not a
# quantity -- convert it to a factor.
LC$verification_status = as.factor(LC$verification_status)

# Rescale percentage-valued columns to decimal fractions for consistency
# with int_rate/revol_util.
LC$dti = LC$dti/100
LC$all_util = LC$all_util/100
LC$bc_util = LC$bc_util/100
LC$percent_bc_gt_75 = LC$percent_bc_gt_75/100

Checking the data type again after converting.

# Check the class of each column after the conversions above.
table(sapply(LC[1,], class))
## 
##  factor integer numeric 
##      10      11       9
# Confirm no missing values remain anywhere in the frame.
anyNA(LC)
## [1] FALSE
## [1] FALSE

Exploratory Data Analysis (EDA)

Factors

# Stacked-proportion bar charts: for each level of each factor, the fill
# shows the Charged_off vs Fully_Paid share (position = 'fill' normalizes
# every bar to 1, making default rates comparable across levels).

# term
term_bar = ggplot(data=LC, aes(x = term, fill= loan_status))+geom_bar( position = 'fill') #+ theme(legend.position = "top")

# sub_grade
Subgrade_bar = ggplot(data=LC, aes(x = sub_grade, fill= loan_status))+geom_bar(position = "fill")# + theme(legend.position = "top")

# emp_length
emp_length_bar = ggplot(data=LC, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")

# home ownership
home_ownership_bar = ggplot(data=LC, aes(y = home_ownership, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# verification_status
verification_status_bar = ggplot(data=LC, aes(x = verification_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# purpose
purpose_bar = ggplot(data=LC, aes(y = purpose, fill= loan_status))+geom_bar(position = "fill")  + theme(legend.position = "top")

# initial_list_status
initial_list_status_bar = ggplot(data=LC, aes(x = initial_list_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# application_type
application_type_bar = ggplot(data=LC, aes(x = application_type, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# Arrange the panels into three figures.
figure.1 <- grid.arrange(term_bar, verification_status_bar,home_ownership_bar,initial_list_status_bar,application_type_bar,ncol =2) 

figure.2 <- grid.arrange(Subgrade_bar,  ncol = 1)

figure.3 <- grid.arrange( emp_length_bar,purpose_bar,  ncol = 1)

Charge off Rate by state.

# Stacked-proportion bar chart of loan_status by state.
ggplot(data = LC, aes(y=addr_state, fill=loan_status)) + geom_bar(position="fill")

# Per-state Charged_off rate.
# NOTE(review): `summary` shadows base::summary() for the rest of the script.
summary <- LC %>%
  group_by(addr_state) %>%
  summarise(percentage = (sum(loan_status == "Charged_off")/(sum(loan_status == "Charged_off")+sum(loan_status == "Fully_Paid"))))

# NOTE(review): swaps rows 1 and 2 -- presumably to realign the first two
# state rows before the positional level renaming below; confirm intent.
summary = summary[c(2,1,3:50),]



library(usmap)

summary$full = summary$addr_state
summary$full = as.factor(summary$full)
# NOTE(review): assigning levels() by position relies on the exact
# alphabetical ordering of the state abbreviations; a missing or extra state
# would silently shift every label. A keyed join on abbreviation (e.g. via
# state.abb/state.name) would be safer -- verify these mappings.
levels(summary$full) = c("","Alaska","Alabama","Arkansas", "Arizona", "California","Colorado","Connecticut","District of Columbia","Delaware","Florida","Georgia","Hawaii","Iowa","Idaho","Illinois","Indiana","Kansas","Kentucky","Louisiana","Massachusetts","Maryland","Maine","Michigan","Minnesota","Missouri","Mississippi","Montana","North Carolina","Nebraska","New Hampshire","New Jersey","New Mexico","Nevada","New York","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah","Virginia","Vermont","Washington","Wisconsin","West Virginia","Wyoming","")
us_states = usmap::us_map("state")

# NOTE(review): hard-coded column indices c(8,11) assume a fixed us_map()
# column layout -- selecting by name would survive usmap version changes.
loanstatusperstate = merge(us_states, summary, by="full")[,c(8,11)]

plot_usmap(data=loanstatusperstate, labels = TRUE, values="percentage") + theme(legend.position = "right") + scale_fill_continuous(name = "Default Loan Percentage") + labs(title = "Default Loan Percentage by States")

check the relationship between purpose and loan_amnt

# Boxplots of loan amount by purpose, split by loan status; reuses the
# purpose_bar proportions chart built in the factor-EDA chunk above.
purpose_box=ggplot(data=LC, aes( y=purpose, x=loan_amnt,fill=loan_status)) + geom_boxplot()
figure.purpose <- grid.arrange( purpose_bar,purpose_box,  ncol = 1)

checking the correlation between_grade and int_rate

# Check the relationship between sub_grade and int_rate (rates should rise
# monotonically as the grade worsens).
ggplot(data=LC, aes( y=sub_grade, x=int_rate,)) + geom_boxplot()

# Remove the outliers: D1/D2 loans priced below 10% interest break the
# otherwise monotone grade/rate pattern.
LC = LC[!(LC$sub_grade%in% c("D1","D2") & LC$int_rate < 0.1),]


# emp_length level counts. Note the levels "< 1 " and "10+ " carry trailing
# spaces left over from stripping "year(s)" earlier.
table(LC$emp_length)
## 
##  < 1     1   10+     2     3     4     5     6     7     8     9    n/a 
##  6709  3669 16906  4669  4152  3206  3582  2448  1876  1701  1362  4451

initial_list_status has little relationship to loan status, so we can drop this feature.
sub_grade has a roughly linear relation to the loan status, so I will convert it to a numeric 1–20 scale, where A1 is the highest (20) and D5 is the lowest (1).
I will also convert the employment length to numeric.

Drop and Transform factor features

# Drop initial_list_status (little relation to loan_status per the EDA).
LC = LC %>% select(-c(initial_list_status))

# Encode sub_grade A1..D5 as an ordinal score 20..1 via a lookup table
# instead of a 20-branch case_when. Grades outside A1-D5 map to NA, matching
# the original case_when's fall-through behavior.
grade_levels <- paste0(rep(c("A", "B", "C", "D"), each = 5), 1:5)  # "A1".."D5"
grade_scores <- setNames(as.numeric(20:1), grade_levels)

LC.data = LC %>%
  mutate(
    # sub_grade is a factor: index the lookup by its character labels, never
    # by the underlying integer codes.
    sub_grade = unname(grade_scores[as.character(sub_grade)]),
    # (A coarser purpose regrouping was prototyped here and abandoned; the
    # dead commented-out code has been removed.)
    # emp_length labels were stripped of "year(s)" earlier and still carry
    # trailing spaces ("10+ ", "< 1 ") -- these patterns match them exactly.
    emp_length = as.numeric(case_when(emp_length == "10+ " ~ "10",
                                      emp_length == "< 1 " ~ "0.5",
                                      emp_length == "n/a" ~ "0",
                                      TRUE ~ as.character(emp_length)))
  )

# Re-draw the employment-length bar chart with the numeric encoding.
emp_length_bar = ggplot(data=LC.data, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")

Quantitative feature correlation

# Confirm the transformed frame has no missing values.
anyNA(LC.data)
## [1] FALSE
# Correlation matrix over all quantitative features, clustered for display.
loanQuanCorr = cor(LC.data %>% select_if(is.numeric))
corrplot(loanQuanCorr, order ='hclust',type="upper", tl.cex = 0.8)

# caret::findCorrelation flags one feature from each pair whose absolute
# correlation exceeds 0.7 (the one with the larger mean absolute correlation).
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose=T, names =T))
## Compare row 14  and column  13 with corr  0.907 
##   Means:  0.24 vs 0.13 so flagging column 14 
## Compare row 13  and column  17 with corr  0.844 
##   Means:  0.2 vs 0.122 so flagging column 13 
## Compare row 2  and column  3 with corr  0.975 
##   Means:  0.169 vs 0.114 so flagging column 2 
## Compare row 11  and column  9 with corr  0.709 
##   Means:  0.196 vs 0.107 so flagging column 11 
## Compare row 10  and column  18 with corr  0.999 
##   Means:  0.101 vs 0.1 so flagging column 10 
## All correlations <= 0.7
## [1] "revol_util" "bc_util"    "sub_grade"  "total_acc"  "pub_rec"
# Drop the five flagged features (pairwise correlation > 0.7).
LC.data = LC.data %>% select(-c(pub_rec, revol_util, bc_util, sub_grade, total_acc))

Removed pub_rec, revol_util, bc_util, sub_grade, and total_acc.

Quantitative feature visualization

checking numeric features

Loan_amount

# Loan amount: density and boxplot by loan status, side by side.
Loan_amount_den=ggplot(data=LC.data) + geom_density(mapping= aes(x = loan_amnt,fill=loan_status), alpha=0.7) + theme(legend.position = "top")
loan_amount_box = ggplot(data=LC.data, aes( x=loan_status, y=loan_amnt,)) + geom_boxplot()
figure.4 <- grid.arrange( Loan_amount_den,loan_amount_box,  ncol = 2)

Charged-off customers usually have higher loan amounts.

interest rate

# Interest rate: density and boxplot by loan status.
int_rate_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = int_rate,fill=loan_status), alpha =0.7)
int_rate_box = ggplot(data=LC.data, aes( x=loan_status, y=int_rate,)) + geom_boxplot()
figure.5 <- grid.arrange( int_rate_den,int_rate_box,  ncol = 2)

# Drop the handful of outlier loans with interest rates above 30%
# (mostly among fully-paid customers).
LC.data = LC.data %>% filter(int_rate <=0.3 )

Loans with higher interest rates have a higher chance of being charged off.
There are a few outliers; remove them.

annual income

# Annual income: density and boxplot by loan status.
annual_inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
annual_inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot()
figure.income <- grid.arrange( annual_inc_den,annual_inc_box,  ncol = 2)

summary(LC.data$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4000   55000   80000   98866  120000 9100000
quantile(LC.data$annual_inc,c(0.95, 0.975))
##      95%    97.5% 
## 213759.5 265000.0
# Heavy right tail (incomes up to 9.1M); trim high-income records.
# NOTE(review): the original comment said "remove top 2.5%", but the 97.5%
# quantile is 265000 and the 95% quantile is 213759.5 -- the hard-coded
# cutoff of 189000 matches neither. Confirm the intended threshold.
LC.data = LC.data %>% filter(annual_inc < 189000  )

# Re-plot after removing the income outliers.
inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot() + ggtitle("After Remove Annual Income Outlier")
figure.income_after <- grid.arrange( inc_den,inc_box,  ncol = 2) 

There are a few extreme outliers — some annual incomes are in the millions — so I remove the highest-income records (the code keeps incomes below $189,000, just under the 95% quantile). Higher-income customers may have a slightly higher chance of paying off the loan.

dti

A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.

# DTI: density and boxplot by loan status.
dti_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = dti,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
dti_box = ggplot(data=LC.data, aes( x=loan_status, y=dti,)) + geom_boxplot()
figure.7 <- grid.arrange( dti_den,dti_box,  ncol = 2)

Charged-off customers usually have a higher DTI.

all_util

Balance to credit limit on all trades

# all_util (balance-to-limit across all trades): density and boxplot by
# loan status.
all_util_bar = ggplot(data=LC.data) + geom_density(mapping= aes(x = all_util,fill=loan_status), alpha=0.7)+ theme(legend.position = "top")
all_util_box = ggplot(data=LC.data, aes( x=loan_status, y=all_util,)) + geom_boxplot()
figure.8 <- grid.arrange( all_util_bar,all_util_box,  ncol = 2)

summary(LC.data$all_util)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3900  0.5500  0.5365  0.6900  1.8500
quantile(LC.data$all_util, 0.975)
## 97.5% 
##  0.93
# Outlier filter considered but intentionally left disabled.
#LC.data = LC.data %>% filter (all_util<=1.2)

fico_score

# FICO score distribution by loan status: density + boxplot
fico_density = ggplot(LC.data) +
  geom_density(aes(x = fico_score, fill = loan_status), alpha = 0.5) +
  theme(legend.position = "top")
fico_box = ggplot(LC.data, aes(x = loan_status, y = fico_score)) + geom_boxplot()
figure.fico <- grid.arrange(fico_density, fico_box, ncol = 2)

#### cr_year: number of years since the borrower’s earliest reported credit line was opened

# cr_year: age (in years) of the borrower's earliest credit line
cr_density = ggplot(LC.data) +
  geom_density(aes(x = cr_year, fill = loan_status), alpha = 0.5) +
  theme(legend.position = "top")
cr_box = ggplot(LC.data, aes(x = loan_status, y = cr_year)) + geom_boxplot()
# BUG FIX: this figure was previously assigned to `figure.fico`, clobbering
# the FICO figure created just above; store it under its own name.
figure.cr <- grid.arrange(cr_density, cr_box, ncol = 2)

summary(LC.data$cr_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   12.00   16.00   17.07   20.00   77.00
quantile(LC.data$cr_year, 0.975)
## 97.5% 
##    36
# Drop extreme credit histories (> 50 years). Use the bare column name inside
# filter() rather than LC.data$cr_year, so dplyr evaluates it on the piped data.
LC.data = LC.data %>% filter(cr_year <= 50)

##percent_bc_gt_75

# percent_bc_gt_75: density + boxplot by loan status.
# BUG FIX: position="dodge" is meaningless for geom_density and produced the
# `Width not defined` warning; the densities are simply overlaid instead.
bc_gt_75_density =ggplot(data=LC.data) + geom_density(mapping= aes(x = percent_bc_gt_75,fill=loan_status), alpha=0.7)+ theme(legend.position = "top")
bc_gt_75_box =ggplot(data=LC.data, aes(x= loan_status, y=percent_bc_gt_75)) + geom_boxplot()

figure.bc_gt_75 <- grid.arrange( bc_gt_75_density,bc_gt_75_box,  ncol = 2)

open_acc

# open_acc: density + boxplot by loan status
open_acc.density = ggplot(data=LC.data) + geom_density(mapping= aes(x = open_acc,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
open_acc.box = ggplot(data=LC.data, aes(x = loan_status, y= open_acc)) + geom_boxplot()
figure.open_acc=grid.arrange( open_acc.density,open_acc.box,  ncol = 2)

summary(LC.data$open_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    8.00   11.00   11.78   15.00   67.00
quantile(LC.data$open_acc, .975)
## 97.5% 
##    27
# Keep records at or below the 97.5% quantile (27) of open_acc.
# (The old comment said "top 5%", but 27 is the 97.5% cutoff computed above.)
LC.data = LC.data%>%filter(open_acc<=27)

num_actv_rev_tl

# num_actv_rev_tl: bar chart and boxplot by loan status
num_actv.density = ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_actv_rev_tl,fill=loan_status), position="dodge")+ theme(legend.position = "top")
num_actv.box = ggplot(data=LC.data, aes(x = loan_status, y= num_actv_rev_tl)) + geom_boxplot()
figure.actv=grid.arrange( num_actv.density,num_actv.box,  ncol = 2)

# BUG FIX: the scatter plot referenced a non-existent object `LC`;
# the data frame used throughout this analysis is `LC.data`.
plot(LC.data$open_acc, LC.data$num_actv_rev_tl)

The scatter plot shows open_acc and num_actv_rev_tl are strongly related, so we can remove open_acc.

###delinq_2yrs The number of 30+ days past-due incidences of delinquency in the borrower’s credit file for the past 2 years

# Bar chart of delinquency counts (past 2 years), split by loan status
ggplot(LC.data) +
  geom_bar(aes(x = delinq_2yrs, fill = loan_status), position = "dodge")

# Collapse delinq_2yrs into a binary flag:
# 1 = at least one delinquency, 0 = none
LC.data = LC.data %>% mutate(delinq_2yrs = case_when(delinq_2yrs == 0 ~ 0,
                                                     delinq_2yrs >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = delinq_2yrs, fill = loan_status), position = "fill") +
  ggtitle("deling_2yrs After Grouping")

After grouping, there is no pattern suggesting that the number of delinquencies in the past 2 years would impact the prediction.

##inq_last_6mths The number of inquiries in past 6 months (excluding auto and mortgage inquiries)

#inq_last_6mths
# Raw distribution of inquiry counts by loan status
ggplot(LC.data) +
  geom_bar(aes(x = inq_last_6mths, fill = loan_status), position = "dodge") +
  theme(legend.position = "top")

# Binarize: 1 = one or more inquiries in the last 6 months, 0 = none
LC.data = LC.data %>% mutate(inq_last_6mths = case_when(inq_last_6mths == 0 ~ 0,
                                                        inq_last_6mths >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = inq_last_6mths, fill = loan_status), position = "fill") +
  theme(legend.position = "top")

pub_rec_bankruptcies

Number of public record bankruptcies.

# Histogram of pub_rec_bankruptcies by loan status (before grouping)
ggplot(LC.data) +
  geom_histogram(aes(x = pub_rec_bankruptcies, fill = loan_status),
                 position = "dodge") +
  theme(legend.position = "top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Binarize: 1 = any public-record bankruptcy, 0 = none
LC.data = LC.data %>% mutate(pub_rec_bankruptcies = case_when(pub_rec_bankruptcies == 0 ~ 0,
                                                              pub_rec_bankruptcies >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = pub_rec_bankruptcies, fill = loan_status), position = "fill") +
  ggtitle("After Grouping (pub_rec_bankruptcies)")

pub_rec_bankruptcies is not important.

mort_acc

Number of mortgage accounts.

# mort_acc: number of mortgage accounts.
# BUG FIX: `postiion` was a misspelling of `position`, so ggplot silently
# ignored it (see the "Ignoring unknown parameters" warning); the bars are
# now actually dodged by loan status.
mort_bar=ggplot(data=LC.data) + geom_bar(mapping= aes(x = mort_acc,fill=loan_status), position="dodge")+ theme(legend.position = "top")
mort_box=ggplot(data=LC.data, aes(x= loan_status, y=mort_acc)) + geom_boxplot()
mort_ownship_box =ggplot(data=LC.data, aes(x =home_ownership, y=mort_acc,fill=loan_status)) + geom_boxplot(position ="dodge")
figure.mort_acc <- grid.arrange( mort_bar,mort_box, mort_ownship_box, ncol = 2)

summary(LC.data$mort_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.415   2.000  23.000
quantile(LC.data$mort_acc, 0.95)
## 95% 
##   5
# Outlier removal (mort_acc >= 8) was considered but deliberately left disabled:
#LC.data = LC.data %>% filter(LC.data$mort_acc<8)

##num_accts_ever_120_pd Number of accounts ever 120 or more days past due

# Distribution of num_accts_ever_120_pd by loan status
ggplot(LC.data) +
  geom_bar(aes(x = num_accts_ever_120_pd, fill = loan_status)) +
  theme(legend.position = "top")

# Binarize: 1 = ever had an account 120+ days past due, 0 = never
LC.data = LC.data %>% mutate(num_accts_ever_120_pd = case_when(num_accts_ever_120_pd < 1 ~ 0,
                                                               num_accts_ever_120_pd >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = num_accts_ever_120_pd, fill = loan_status), position = "fill") +
  theme(legend.position = "top")

num_tl_op_past_12m

Number of accounts opened in past 12 months

# num_tl_op_past_12m: bar chart and boxplot by loan status
tl_open_12m_bar = ggplot(LC.data) +
  geom_bar(aes(x = num_tl_op_past_12m, fill = loan_status), position = "dodge") +
  theme(legend.position = "top")
tl_open_12m_box = ggplot(LC.data, aes(x = loan_status, y = num_tl_op_past_12m)) +
  geom_boxplot()

figure.tl_open <- grid.arrange(tl_open_12m_bar, tl_open_12m_box, ncol = 2)

# Keep borrowers who opened at most 10 accounts in the past 12 months
LC.data = LC.data %>% filter(num_tl_op_past_12m <= 10)

remove unrelated features

# Drop the columns that were either binarized away or judged uninformative above
LC.data = LC.data %>% select(-open_acc, -delinq_2yrs, -inq_last_6mths, -pub_rec_bankruptcies)

check correlation again

# Recompute the correlation matrix over the remaining numeric columns
loanQuanCorr = cor(select_if(LC.data, is.numeric))
corrplot(loanQuanCorr, type = "upper", order = 'hclust', tl.cex = 0.8)

# Confirm no pair of predictors is still correlated above 0.7
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose = T, names = T))
## All correlations <= 0.7
## character(0)