Introduction

Lending Club is a US peer-to-peer lending company and, as described by Wikipedia, the world’s largest peer-to-peer lending platform.

The goal of this project is to use data from Lending Club (2019Q3 to 2020Q2) to conduct a set of exploratory analyses and apply multiple machine learning techniques to predict the customer default rate.

Data files contain complete loan data for all loans issued through the time period stated, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information.
Based on this goal, we will focus only on records whose loan status is Fully Paid, Charged Off, Default, or Late. We will remove records that are not in these statuses, and group Charged Off, Default, and Late into a single category called Default.

Reading in the data

# Read the four quarterly LendingClub extracts and stack them into one frame.
# NOTE(review): setwd() with an absolute user-specific path is fragile for
# anyone else running this notebook -- project-relative paths would be safer;
# kept as-is so the knitted output still resolves.
setwd("/Users/huitingsheng/Downloads/Github/Lending_Club_EDA_and_Modeling")
data1 <- read.csv("LoanStats_securev1_2019Q3.csv")
data2 <- read.csv("LoanStats_securev1_2019Q4.csv")
data3 <- read.csv("LoanStats_securev1_2020Q1.csv")
data4 <- read.csv("LoanStats_securev1_2020Q2.csv")
# rbind() requires all four extracts to share an identical column set.
data <- rbind(data1, data2, data3, data4)
dim(data)
## [1] 389627    150

Installing/Loading Packages

packages <- c("caret", "ggplot2", "tidyverse", "dplyr", "corrplot", "e1071",
              "reshape2", "lubridate", "usmap", "glmnet", "pROC", "doParallel",
              "ranger", "lattice", "gridExtra", "kableExtra", "ROSE", "DMwR")

# Install any missing package, then attach them all.
# requireNamespace() checks availability without attaching (avoiding the
# require()-for-loading anti-pattern); library() then fails loudly if an
# install did not succeed instead of silently returning FALSE.
# NOTE(review): DMwR has been archived on CRAN; install.packages() may fail
# for it -- confirm an alternate source (e.g. an archived tarball) if needed.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

Remove all records that loan Status is Current and Issued

Our goal is to predict the default rate of customers, so before we move forward to clean up the data, let’s combine the loan statuses. We only want records that are Fully Paid, Default, Charged Off, or Late. Default, Charged Off, and Late are considered risky customers that we want to identify.
As we don’t know whether currently active or newly issued loans will be fully paid or become charged off, we remove all Current and Issued records, and combine Default, Charged Off, and Late into one group called Charged_off.

# Work on a copy so the raw `data` frame stays untouched.
df = data
# Tabulate the raw loan_status values; the first row shows 16 records with a
# blank ("") status.
as.data.frame(table(df$loan_status))
##                 Var1   Freq
## 1                        16
## 2        Charged Off   6387
## 3            Current 333956
## 4            Default     67
## 5         Fully Paid  40207
## 6    In Grace Period   4070
## 7  Late (16-30 days)   1384
## 8 Late (31-120 days)   3531
## 9             Issued      9
# Visualize the class distribution of loan_status.
ggplot(data=df,aes(y=loan_status)) + geom_bar()

# Keep only resolved loans: drop Current, Issued, and blank statuses, then
# collapse every remaining status into a binary Fully_Paid / Charged_off
# outcome.
# NOTE: statuses load as "" rather than NA (see the table above), so the
# %in% test filters exactly the same rows as the original chain of !=
# comparisons. The credit-policy label below is reproduced byte-for-byte,
# including its leading/trailing spaces.
excluded_status <- c("Current", "Issued", "")
df = df %>%
  filter(!loan_status %in% excluded_status) %>%
  mutate(loan_status = if_else(loan_status %in%
                        c("Fully Paid"," Does not meet the credit policy. Status:Fully Paid "),
                        "Fully_Paid","Charged_off"))

# Two-class outcome counts (imbalanced: roughly 28% Charged_off).
as.data.frame(table(df$loan_status))
##          Var1  Freq
## 1 Charged_off 15439
## 2  Fully_Paid 40207

Charged_off and Fully_Paid are imbalanced. We will use down-sampling while building the models.

Combining Application type

There are two application types, individual and joint, but only about 10% of applications are joint. I want to transform the related features before formal data cleansing, in case all joint-application information gets dropped.

# For joint applications, replace the individual-level income, DTI, and
# verification status with their joint counterparts, then drop the now
# redundant *_joint columns so every row uses one consistent set of fields.
df = df %>% 
  mutate(annual_inc = ifelse(application_type == "Joint App",annual_inc_joint,annual_inc), 
         dti = ifelse(application_type == "Joint App", dti_joint,dti),
         verification_status = ifelse(application_type == "Joint App",
                                      verification_status_joint,verification_status)) %>%
  select(-c(annual_inc_joint,dti_joint,verification_status_joint))

drop columns that have no variance or low variance

No-variance and low-variance features don’t bring any useful information to our model, so we are going to drop them.

# caret::nearZeroVar() flags columns with (near-)zero variance; such columns
# carry no predictive signal, so record their names and drop them.
nzv = colnames(df)[nearZeroVar(df, allowParallel = TRUE)]
cat("Zero or low Variance Columns are: \n", nzv)
## Zero or low Variance Columns are: 
##  member_id pymnt_plan desc total_rec_late_fee recoveries collection_recovery_fee collections_12_mths_ex_med policy_code acc_now_delinq tot_coll_amt chargeoff_within_12_mths delinq_amnt num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m pct_tl_nvr_dlq tax_liens sec_app_earliest_cr_line sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med hardship_amount hardship_end_date payment_plan_start_date hardship_dpd debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date
data_wo_nzv = df %>% select(-all_of(nzv)) # drop no and low variance features

We have dropped 28 features.

Drop features missing more than 50% data

If a feature is missing more than 50% of its values, we want to drop it, as it does not bring extra information to our model.

# Per-column missing-data audit.
# Vectorized with colSums(is.na()) instead of growing `per` and `fraction`
# inside a loop (the original append/rbind-in-loop pattern copies the
# accumulator on every iteration).
na_count <- colSums(is.na(data_wo_nzv))
# `per` keeps its original form: an unnamed numeric vector of NA fractions,
# reused later to select columns with > 50% missing.
per <- unname(na_count / nrow(data_wo_nzv))

# Same shape as the original rbind-built frame: three all-character columns
# named name / count / missing_fraction.
fraction <- data.frame(
  name = colnames(data_wo_nzv),
  count = as.character(na_count),
  missing_fraction = as.character(per),
  stringsAsFactors = FALSE
)
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name count missing_fraction
id 0 0
loan_amnt 0 0
funded_amnt 0 0
funded_amnt_inv 0 0
term 0 0
int_rate 0 0
installment 0 0
grade 0 0
sub_grade 0 0
emp_title 1 1.79707436293714e-05
emp_length 0 0
home_ownership 0 0
annual_inc 0 0
verification_status 0 0
issue_d 0 0
loan_status 0 0
url 0 0
purpose 0 0
title 0 0
zip_code 0 0
addr_state 0 0
dti 0 0
delinq_2yrs 0 0
earliest_cr_line 0 0
fico_range_low 0 0
fico_range_high 0 0
inq_last_6mths 0 0
mths_since_last_delinq 31076 0.558458829026345
mths_since_last_record 49227 0.884645796643065
open_acc 0 0
pub_rec 0 0
revol_bal 0 0
revol_util 0 0
total_acc 0 0
initial_list_status 0 0
out_prncp 0 0
out_prncp_inv 0 0
total_pymnt 0 0
total_pymnt_inv 0 0
total_rec_prncp 0 0
total_rec_int 0 0
last_pymnt_d 0 0
last_pymnt_amnt 0 0
next_pymnt_d 0 0
last_credit_pull_d 0 0
last_fico_range_high 0 0
last_fico_range_low 0 0
mths_since_last_major_derog 43194 0.776228300327068
application_type 0 0
tot_cur_bal 0 0
open_acc_6m 0 0
open_act_il 0 0
open_il_12m 0 0
open_il_24m 0 0
mths_since_rcnt_il 1279 0.022984581101966
total_bal_il 0 0
il_util 7733 0.138967760485929
open_rv_12m 0 0
open_rv_24m 0 0
max_bal_bc 0 0
all_util 20 0.000359414872587428
total_rev_hi_lim 0 0
inq_fi 0 0
total_cu_tl 0 0
inq_last_12m 0 0
acc_open_past_24mths 0 0
avg_cur_bal 10 0.000179707436293714
bc_open_to_buy 872 0.0156704884448118
bc_util 908 0.0163174352154692
mo_sin_old_il_acct 1279 0.022984581101966
mo_sin_old_rev_tl_op 0 0
mo_sin_rcnt_rev_tl_op 0 0
mo_sin_rcnt_tl 0 0
mort_acc 0 0
mths_since_recent_bc 814 0.0146281853143083
mths_since_recent_bc_dlq 44081 0.79216834992632
mths_since_recent_inq 4438 0.0797541602271502
mths_since_recent_revol_delinq 39139 0.703356934909967
num_accts_ever_120_pd 0 0
num_actv_bc_tl 0 0
num_actv_rev_tl 0 0
num_bc_sats 0 0
num_bc_tl 0 0
num_il_tl 0 0
num_op_rev_tl 0 0
num_rev_accts 0 0
num_rev_tl_bal_gt_0 0 0
num_sats 0 0
num_tl_op_past_12m 0 0
percent_bc_gt_75 874 0.0157064299320706
pub_rec_bankruptcies 0 0
tot_hi_cred_lim 0 0
total_bal_ex_mort 0 0
total_bc_limit 0 0
total_il_high_credit_limit 0 0
revol_bal_joint 48503 0.8716349782554
sec_app_fico_range_low 48503 0.8716349782554
sec_app_fico_range_high 48503 0.8716349782554
sec_app_inq_last_6mths 48503 0.8716349782554
sec_app_mort_acc 48503 0.8716349782554
sec_app_open_acc 48503 0.8716349782554
sec_app_revol_util 48646 0.8742047945944
sec_app_open_act_il 48503 0.8716349782554
sec_app_num_rev_accts 48503 0.8716349782554
sec_app_mths_since_last_major_derog 53403 0.95969162203932
hardship_flag 0 0
hardship_type 0 0
hardship_reason 0 0
hardship_status 0 0
deferral_term 49429 0.888275886856198
hardship_start_date 0 0
hardship_length 49429 0.888275886856198
hardship_loan_status 0 0
orig_projected_additional_accrued_interest 42743 0.768123494950221
hardship_payoff_balance_amount 42743 0.768123494950221
hardship_last_payment_amount 42743 0.768123494950221
settlement_amount 55576 0.998742047945944
settlement_percentage 55576 0.998742047945944
settlement_term 55576 0.998742047945944
# Histogram of Feature Incompleteness (distribution of per-column NA rates).
hist(per, main="Histogram of Feature Incompleteness", xlab="missing fraction")

# Drop columns missing more than 50% of their records.
# Name the column set once and select with all_of(): negating a bare
# character vector inside select() is deprecated tidyselect behavior.
high_na_cols <- colnames(data_wo_nzv)[per > 0.5]
data_drop = data_wo_nzv %>% select(-all_of(high_na_cols))
cat("drop columns are:\n", high_na_cols)
## drop columns are:
##  mths_since_last_delinq mths_since_last_record mths_since_last_major_derog mths_since_recent_bc_dlq mths_since_recent_revol_delinq revol_bal_joint sec_app_fico_range_low sec_app_fico_range_high sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_mths_since_last_major_derog deferral_term hardship_length orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount settlement_amount settlement_percentage settlement_term

We have dropped another 23 features.

drop columns that don’t need

We want to remove noise features such as url, desc, etc. Also, we want to focus on the information that customers provide to Lending Club when they apply for the loan, so we will ignore all features generated after the loan is approved.

# Inspect the first 10 rows of the current data before further feature selection.
data_drop[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status url purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low application_type tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_inq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_op_past_12m percent_bc_gt_75 pub_rec_bankruptcies tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit hardship_flag hardship_type hardship_reason hardship_status hardship_start_date hardship_loan_status
158332903 8000 8000 8000 36 months 10.33% 259.38 B B1 Processing Manager 6 years MORTGAGE 116600 3 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=158332903 debt_consolidation Debt consolidation 982xx WA 36.48 0 Mar-05 700 704 1 19 0 67350 55% 34 w 0.00 0.00 8144.882 8144.88 8000.00 144.88 Dec-19 7890.09 Aug-20 704 700 Individual 137633 1 4 3 4 2 70283 74 0 1 19425 64 122400 3 6 3 5 7244 31138 47.5 158 174 13 2 0 13 2 0 4 8 7 10 11 15 23 8 19 3 42.9 0 216889 137633 59300 94489 N
159383484 25000 25000 25000 36 months 28.80% 1044.93 D D5 traffic cordinator 10+ years MORTGAGE 80000 3 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159383484 debt_consolidation Debt consolidation 351xx AL 18.15 0 Sep-08 695 699 0 6 0 43105 85.90% 30 w 0.00 0.00 25320.000 25320.00 25000.00 320.00 Oct-19 25360.00 Jan-20 744 740 Individual 131078 1 1 0 1 21 26318 76 1 2 2988 82 50200 1 12 0 3 21846 7212 29.3 120 132 3 3 3 17 21 0 1 3 2 5 17 4 10 3 6 1 50.0 0 157774 69423 10200 34574 N
159523765 15950 15950 15950 36 months 28.80% 666.67 D D5 < 1 year RENT 125000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=159523765 debt_consolidation Debt consolidation 331xx FL 20.75 0 May-06 680 684 0 11 0 32528 47.30% 45 w 0.00 0.00 1891.150 1891.15 0.00 0.00 0.00 Apr-20 544 540 Individual 50854 1 1 0 2 13 18326 64 5 5 14173 52 68800 2 1 4 8 4623 29864 51.7 91 160 4 4 1 4 7 0 6 7 9 27 9 10 33 7 11 6 33.3 0 97398 50854 61800 28598 N
159568503 9900 9900 9900 36 months 8.19% 311.10 A A4 Teacher 9 years MORTGAGE 333000 1 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159568503 credit_card Credit card refinancing 212xx MD 6.38 0 Sep-04 715 719 0 11 0 18633 54% 30 w 0.00 0.00 10203.490 10203.49 9900.00 303.49 Feb-20 8963.59 Mar-20 674 670 Joint App 503648 1 3 1 2 11 84319 74 2 3 5847 57 34500 0 3 0 5 45786 8967 65.8 168 180 5 5 1 5 16 4 5 6 5 6 16 7 13 6 11 3 60.0 0 539572 102952 26200 87770 N
159216296 16000 16000 16000 60 months 8.19% 325.88 A A4 Business Agent 2 years MORTGAGE 116000 2 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159216296 credit_card Credit card refinancing 554xx MN 15.69 0 Feb-04 720 724 0 25 0 44990 20.70% 68 w 0.00 0.00 16417.115 16417.12 16000.00 417.12 Dec-19 15772.64 Jan-20 764 760 Individual 53953 1 2 0 0 31 8963 30 2 5 10900 22 217000 0 4 2 6 2158 171110 19.3 140 187 9 2 4 9 3 0 13 14 22 32 20 23 44 14 25 3 0.0 0 246763 53953 212000 29763 N
159602788 15950 15950 15950 36 months 20.55% 597.24 D D2 Project Manager 8 years MORTGAGE 105000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=159602788 credit_card Credit card refinancing 776xx TX 27.61 0 Sep-06 710 714 0 10 0 11978 22.80% 38 w 0.00 0.00 2967.990 2967.99 1685.29 1282.70 Feb-20 597.24 Aug-20 554 550 Individual 288166 0 3 0 3 18 114203 NA 1 3 9489 23 52500 0 0 5 6 28817 40522 22.8 156 78 9 9 2 9 1 0 5 5 6 7 25 6 11 5 10 1 33.3 0 362222 126181 52500 138559 N
159026517 4200 4200 4200 36 months 20.55% 157.27 D D2 Rural Carrier associate 3 years MORTGAGE 44500 4 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159026517 debt_consolidation Debt consolidation 760xx TX 35.30 0 Jun-03 670 674 1 16 0 15665 56.10% 22 w 0.00 0.00 4670.050 4670.05 4200.00 470.05 May-20 3731.23 May-20 674 670 Individual 133206 2 4 2 3 10 17370 58 3 4 4555 57 27900 2 0 4 7 8325 3483 79.6 82 81 3 3 1 3 2 0 6 10 6 8 6 11 15 10 16 5 66.7 0 197254 33035 17100 29979
158884774 27200 27200 27200 60 months 15.24% 650.52 C C2 Dentist 1 year RENT 150000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=158884774 credit_card Credit card refinancing 029xx RI 23.66 0 Feb-05 695 699 0 12 0 31561 51.70% 27 w 0.00 0.00 2611.580 2611.58 1238.32 1340.73 Mar-20 0.00 Aug-20 564 560 Individual 863114 3 7 2 2 3 831553 92 1 1 29624 68 61000 0 0 0 3 86311 29439 51.7 175 159 4 3 0 4 NA 0 2 2 3 4 23 3 4 2 12 3 66.7 0 724574 863114 61000 663574 N
159598770 5250 5250 5250 36 months 7.56% 163.46 A A3 Product Manager 6 years RENT 96000 2 Sep-19 Fully_Paid https://lendingclub.com/browse/loanDetail.action?loan_id=159598770 debt_consolidation Debt consolidation 275xx NC 2.09 0 Apr-10 720 724 0 6 0 3012 5.70% 16 w 0.00 0.00 5376.050 5376.05 5250.00 126.05 Feb-20 3887.87 Aug-20 724 720 Individual 139796 0 0 1 1 8 0 NA 3 3 1461 6 52500 1 0 2 4 23299 36039 3.9 56 113 11 8 1 12 8 0 1 2 2 6 2 5 13 2 6 4 0.0 0 214511 3012 37500 0 N
158744038 26000 26000 26000 36 months 15.24% 904.36 C C2 Security guard < 1 year RENT 89000 3 Sep-19 Charged_off https://lendingclub.com/browse/loanDetail.action?loan_id=158744038 debt_consolidation Debt consolidation 114xx NY 7.42 0 Sep-08 750 754 1 7 0 0 0% 15 w 22443.41 22443.41 5404.150 5404.15 3556.59 1847.56 May-20 0.00 Sep-20 Aug-20 639 635 Joint App 22251 1 2 2 3 5 22251 92 1 2 0 51 19200 2 0 2 5 3179 18000 0.0 16 132 11 5 0 24 5 0 0 0 4 8 3 5 12 0 7 3 0.0 0 43300 22251 18000 24100 N CVD19SKIP INCOMECURT COMPLETE Apr-20 ACTIVE
# Keep only application-time features (information available when the loan is
# requested) plus the loan_status label; everything generated after approval
# (payment history, recovery amounts, hardship fields, ...) is discarded.
columns_keep = c('loan_amnt','term','sub_grade',"int_rate",'emp_length','home_ownership','annual_inc','verification_status','purpose','addr_state','dti','delinq_2yrs','earliest_cr_line','fico_range_low','fico_range_high', "inq_last_6mths", 'open_acc','pub_rec','total_acc',"initial_list_status","application_type","all_util","bc_util",'revol_util','mort_acc', 'num_accts_ever_120_pd', "percent_bc_gt_75", 'pub_rec_bankruptcies',"num_tl_op_past_12m","num_actv_rev_tl",'loan_status')  

# LC holds the modeling frame; col_drops records what was removed for the log.
LC = data_drop %>%select(all_of(columns_keep))
col_drops = data_drop %>%select(-all_of(columns_keep))%>%colnames
cat("columns that droped: \n", col_drops)
## columns that droped: 
##  id funded_amnt funded_amnt_inv installment grade emp_title issue_d url title zip_code revol_bal out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mths_since_recent_bc mths_since_recent_inq num_actv_bc_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit hardship_flag hardship_type hardship_reason hardship_status hardship_start_date hardship_loan_status
# remaining features
LC[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
loan_amnt term sub_grade int_rate emp_length home_ownership annual_inc verification_status purpose addr_state dti delinq_2yrs earliest_cr_line fico_range_low fico_range_high inq_last_6mths open_acc pub_rec total_acc initial_list_status application_type all_util bc_util revol_util mort_acc num_accts_ever_120_pd percent_bc_gt_75 pub_rec_bankruptcies num_tl_op_past_12m num_actv_rev_tl loan_status
8000 36 months B1 10.33% 6 years MORTGAGE 116600 3 debt_consolidation WA 36.48 0 Mar-05 700 704 1 19 0 34 w Individual 64 47.5 55% 0 0 42.9 0 3 8 Fully_Paid
25000 36 months D5 28.80% 10+ years MORTGAGE 80000 3 debt_consolidation AL 18.15 0 Sep-08 695 699 0 6 0 30 w Individual 82 29.3 85.90% 3 0 50.0 0 1 3 Fully_Paid
15950 36 months D5 28.80% < 1 year RENT 125000 3 debt_consolidation FL 20.75 0 May-06 680 684 0 11 0 45 w Individual 52 51.7 47.30% 1 0 33.3 0 6 7 Charged_off
9900 36 months A4 8.19% 9 years MORTGAGE 333000 1 credit_card MD 6.38 0 Sep-04 715 719 0 11 0 30 w Joint App 57 65.8 54% 1 4 60.0 0 3 6 Fully_Paid
16000 60 months A4 8.19% 2 years MORTGAGE 116000 2 credit_card MN 15.69 0 Feb-04 720 724 0 25 0 68 w Individual 22 19.3 20.70% 4 0 0.0 0 3 14 Fully_Paid
15950 36 months D2 20.55% 8 years MORTGAGE 105000 3 credit_card TX 27.61 0 Sep-06 710 714 0 10 0 38 w Individual 23 22.8 22.80% 2 0 33.3 0 1 5 Charged_off
4200 36 months D2 20.55% 3 years MORTGAGE 44500 4 debt_consolidation TX 35.30 0 Jun-03 670 674 1 16 0 22 w Individual 57 79.6 56.10% 1 0 66.7 0 5 10 Fully_Paid
27200 60 months C2 15.24% 1 year RENT 150000 3 credit_card RI 23.66 0 Feb-05 695 699 0 12 0 27 w Individual 68 51.7 51.70% 0 0 66.7 0 3 2 Charged_off
5250 36 months A3 7.56% 6 years RENT 96000 2 debt_consolidation NC 2.09 0 Apr-10 720 724 0 6 0 16 w Individual 6 3.9 5.70% 1 0 0.0 0 4 2 Fully_Paid
26000 36 months C2 15.24% < 1 year RENT 89000 3 debt_consolidation NY 7.42 0 Sep-08 750 754 1 7 0 15 w Joint App 51 0.0 0% 0 0 0.0 0 3 0 Charged_off

cleaning data with NA

# Re-audit missing data on the reduced feature set.
# Vectorized with colSums(is.na()) instead of growing `fraction` with rbind
# inside a loop; produces the same three all-character columns as before.
na_count_lc <- colSums(is.na(LC))
fraction <- data.frame(
  name = colnames(LC),
  count = as.character(na_count_lc),
  missing_fraction = as.character(unname(na_count_lc / nrow(LC))),
  stringsAsFactors = FALSE
)
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name count missing_fraction
loan_amnt 0 0
term 0 0
sub_grade 0 0
int_rate 0 0
emp_length 0 0
home_ownership 0 0
annual_inc 0 0
verification_status 0 0
purpose 0 0
addr_state 0 0
dti 0 0
delinq_2yrs 0 0
earliest_cr_line 0 0
fico_range_low 0 0
fico_range_high 0 0
inq_last_6mths 0 0
open_acc 0 0
pub_rec 0 0
total_acc 0 0
initial_list_status 0 0
application_type 0 0
all_util 20 0.000359414872587428
bc_util 908 0.0163174352154692
revol_util 0 0
mort_acc 0 0
num_accts_ever_120_pd 0 0
percent_bc_gt_75 874 0.0157064299320706
pub_rec_bankruptcies 0 0
num_tl_op_past_12m 0 0
num_actv_rev_tl 0 0
loan_status 0 0
# With ~55.6k rows and at most ~1.6% missing in any remaining column,
# dropping incomplete rows is cheap, so we can just drop all records with
# missing values rather than impute.
LC = LC%>%drop_na
# Class balance after row removal (still roughly 28% Charged_off).
as.data.frame(table(LC$loan_status))
##          Var1  Freq
## 1 Charged_off 15178
## 2  Fully_Paid 39558
##          Var1  Freq
## 1 Charged_off 15178
## 2  Fully_Paid 39558

data transformation

earliest_cr_line is the month the borrower’s earliest reported credit line was opened. We want to convert it to a numeric value showing how many years the borrower has had a credit line since the first one was opened.

# Convert earliest_cr_line (e.g. "Sep-04") into credit-history length in years.
#
# convert_year(): two-digit-year century fix. strptime's %y pivots at 68/69,
# so years like "68" parse as 2068. This remaps any two-digit year greater
# than `year %% 100` (default 40) to the 1900s and the rest to the 2000s,
# overriding the parser's pivot.
convert_year <- function(x, year=1940){
  m <- year(x) %% 100
  year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m)
  x
}

# Pipeline: prefix a day ("01-") so the month string parses as a full date,
# fix the century, subtract from today, and round to whole years.
# NOTE(review): format="%d-%B-%y" uses %B (full month name) against
# abbreviated names like "Sep"; glibc accepts both, but this is
# locale/platform sensitive -- %b would be the exact specifier. Confirm on
# the target platform.
# NOTE(review): Sys.Date() makes cr_year depend on when the notebook is knit.
LC = LC%>%mutate(cr_year = earliest_cr_line %>% 
                                         paste0("01-",.) %>%
                                         as.Date(format="%d-%B-%y") %>% 
                                         convert_year () %>% 
                                         difftime(Sys.Date(), .) %>% 
                                         time_length("years") %>%
                                         round(0)) %>% select (-earliest_cr_line)

take the average of fico score high and low.

# Collapse the reported FICO range to its midpoint, then drop the raw bounds.
LC = LC %>%
  mutate(fico_score = (fico_range_low + fico_range_high) / 2) %>%
  select(-c(fico_range_low, fico_range_high))

correct the data type

# Audit the class of every column (computed on a single row for speed).
table(sapply(LC[1,], class))
## 
## character    factor   integer   numeric 
##         1        10        13         6
cat("Character columns: ", LC %>% select_if(is.character) %>% colnames, "\n")
## Character columns:  loan_status
# Convert loan_status (the response) to a factor for modeling.
LC$loan_status = as.factor(LC$loan_status)
cat("Factor columns: ", LC %>% select_if(is.factor )%>% colnames, "\n")
## Factor columns:  term sub_grade int_rate emp_length home_ownership purpose addr_state initial_list_status application_type revol_util loan_status
# int_rate loads as a "x.xx%" string; strip the "%" and rescale to a decimal.
LC$int_rate = as.numeric(sub("%","",LC$int_rate))/100

# Same treatment for revol_util.
LC$revol_util = as.numeric(sub("%","",LC$revol_util))/100

# Strip "years"/"year" from emp_length. NOTE: this leaves trailing spaces in
# levels such as "10+ " and "< 1 "; the later numeric recoding matches those
# strings exactly, so they must not be trimmed here without updating it.
LC$emp_length = sub("years","",LC$emp_length)
LC$emp_length = as.factor(sub("year","",LC$emp_length))

cat("Numeric columns: ", LC %>% select_if(is.numeric) %>% colnames,"\n")
## Numeric columns:  loan_amnt int_rate annual_inc verification_status dti delinq_2yrs inq_last_6mths open_acc pub_rec total_acc all_util bc_util revol_util mort_acc num_accts_ever_120_pd percent_bc_gt_75 pub_rec_bankruptcies num_tl_op_past_12m num_actv_rev_tl cr_year fico_score
# verification_status is an encoded category (values 1-4 above), not a
# quantity -- convert it to a factor.
LC$verification_status = as.factor(LC$verification_status)

# Rescale percentage-valued columns to decimal fractions for consistency
# with int_rate/revol_util.
LC$dti = LC$dti/100
LC$all_util = LC$all_util/100
LC$bc_util = LC$bc_util/100
LC$percent_bc_gt_75 = LC$percent_bc_gt_75/100

Checking the data type again after converting.

# Check the class of each column after the conversions above.
table(sapply(LC[1,], class))
## 
##  factor integer numeric 
##      10      11       9
# Confirm no missing values remain anywhere in the frame.
anyNA(LC)
## [1] FALSE
## [1] FALSE

Exploratory Data Analysis (EDA)

Factors

# Stacked-proportion bar charts: for each level of each factor, the fill
# shows the Charged_off vs Fully_Paid share (position = 'fill' normalizes
# every bar to 1, making default rates comparable across levels).

# term
term_bar = ggplot(data=LC, aes(x = term, fill= loan_status))+geom_bar( position = 'fill') #+ theme(legend.position = "top")

# sub_grade
Subgrade_bar = ggplot(data=LC, aes(x = sub_grade, fill= loan_status))+geom_bar(position = "fill")# + theme(legend.position = "top")

# emp_length
emp_length_bar = ggplot(data=LC, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")

# home ownership
home_ownership_bar = ggplot(data=LC, aes(y = home_ownership, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# verification_status
verification_status_bar = ggplot(data=LC, aes(x = verification_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# purpose
purpose_bar = ggplot(data=LC, aes(y = purpose, fill= loan_status))+geom_bar(position = "fill")  + theme(legend.position = "top")

# initial_list_status
initial_list_status_bar = ggplot(data=LC, aes(x = initial_list_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# application_type
application_type_bar = ggplot(data=LC, aes(x = application_type, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")

# Arrange the panels into three figures.
figure.1 <- grid.arrange(term_bar, verification_status_bar,home_ownership_bar,initial_list_status_bar,application_type_bar,ncol =2) 

figure.2 <- grid.arrange(Subgrade_bar,  ncol = 1)

figure.3 <- grid.arrange( emp_length_bar,purpose_bar,  ncol = 1)

Charge off Rate by state.

# Stacked-proportion bar chart of loan_status by state.
ggplot(data = LC, aes(y=addr_state, fill=loan_status)) + geom_bar(position="fill")

# Per-state Charged_off rate.
# NOTE(review): `summary` shadows base::summary() for the rest of the script.
summary <- LC %>%
  group_by(addr_state) %>%
  summarise(percentage = (sum(loan_status == "Charged_off")/(sum(loan_status == "Charged_off")+sum(loan_status == "Fully_Paid"))))

# NOTE(review): swaps rows 1 and 2 -- presumably to realign the first two
# state rows before the positional level renaming below; confirm intent.
summary = summary[c(2,1,3:50),]



library(usmap)

summary$full = summary$addr_state
summary$full = as.factor(summary$full)
# NOTE(review): assigning levels() by position relies on the exact
# alphabetical ordering of the state abbreviations; a missing or extra state
# would silently shift every label. A keyed join on abbreviation (e.g. via
# state.abb/state.name) would be safer -- verify these mappings.
levels(summary$full) = c("","Alaska","Alabama","Arkansas", "Arizona", "California","Colorado","Connecticut","District of Columbia","Delaware","Florida","Georgia","Hawaii","Iowa","Idaho","Illinois","Indiana","Kansas","Kentucky","Louisiana","Massachusetts","Maryland","Maine","Michigan","Minnesota","Missouri","Mississippi","Montana","North Carolina","Nebraska","New Hampshire","New Jersey","New Mexico","Nevada","New York","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah","Virginia","Vermont","Washington","Wisconsin","West Virginia","Wyoming","")
us_states = usmap::us_map("state")

# NOTE(review): hard-coded column indices c(8,11) assume a fixed us_map()
# column layout -- selecting by name would survive usmap version changes.
loanstatusperstate = merge(us_states, summary, by="full")[,c(8,11)]

plot_usmap(data=loanstatusperstate, labels = TRUE, values="percentage") + theme(legend.position = "right") + scale_fill_continuous(name = "Default Loan Percentage") + labs(title = "Default Loan Percentage by States")

check the relationship between purpose and loan_amnt

# Boxplots of loan amount by purpose, split by loan status; reuses the
# purpose_bar proportions chart built in the factor-EDA chunk above.
purpose_box=ggplot(data=LC, aes( y=purpose, x=loan_amnt,fill=loan_status)) + geom_boxplot()
figure.purpose <- grid.arrange( purpose_bar,purpose_box,  ncol = 1)

checking the correlation between_grade and int_rate

# Check the relationship between sub_grade and int_rate (rates should rise
# monotonically as the grade worsens).
ggplot(data=LC, aes( y=sub_grade, x=int_rate,)) + geom_boxplot()

# Remove the outliers: D1/D2 loans priced below 10% interest break the
# otherwise monotone grade/rate pattern.
LC = LC[!(LC$sub_grade%in% c("D1","D2") & LC$int_rate < 0.1),]


# emp_length level counts. Note the levels "< 1 " and "10+ " carry trailing
# spaces left over from stripping "year(s)" earlier.
table(LC$emp_length)
## 
##  < 1     1   10+     2     3     4     5     6     7     8     9    n/a 
##  6709  3669 16906  4669  4152  3206  3582  2448  1876  1701  1362  4451

initial_list_status has little relationship to loan status, so we can drop this feature.
sub_grade has a roughly linear relation to the loan status, so I will convert it to a numeric 1–20 scale, where A1 is the highest (20) and D5 is the lowest (1).
I will also convert the employment length to numeric.

Drop and Transform factor features

# Drop initial_list_status (little relation to loan_status per the EDA).
LC = LC %>% select(-c(initial_list_status))

# Encode sub_grade A1..D5 as an ordinal score 20..1 via a lookup table
# instead of a 20-branch case_when. Grades outside A1-D5 map to NA, matching
# the original case_when's fall-through behavior.
grade_levels <- paste0(rep(c("A", "B", "C", "D"), each = 5), 1:5)  # "A1".."D5"
grade_scores <- setNames(as.numeric(20:1), grade_levels)

LC.data = LC %>%
  mutate(
    # sub_grade is a factor: index the lookup by its character labels, never
    # by the underlying integer codes.
    sub_grade = unname(grade_scores[as.character(sub_grade)]),
    # (A coarser purpose regrouping was prototyped here and abandoned; the
    # dead commented-out code has been removed.)
    # emp_length labels were stripped of "year(s)" earlier and still carry
    # trailing spaces ("10+ ", "< 1 ") -- these patterns match them exactly.
    emp_length = as.numeric(case_when(emp_length == "10+ " ~ "10",
                                      emp_length == "< 1 " ~ "0.5",
                                      emp_length == "n/a" ~ "0",
                                      TRUE ~ as.character(emp_length)))
  )

# Re-draw the employment-length bar chart with the numeric encoding.
emp_length_bar = ggplot(data=LC.data, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")

Quantitative feature correlation

# Confirm the transformed frame has no missing values.
anyNA(LC.data)
## [1] FALSE
# Correlation matrix over all quantitative features, clustered for display.
loanQuanCorr = cor(LC.data %>% select_if(is.numeric))
corrplot(loanQuanCorr, order ='hclust',type="upper", tl.cex = 0.8)

# caret::findCorrelation flags one feature from each pair whose absolute
# correlation exceeds 0.7 (the one with the larger mean absolute correlation).
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose=T, names =T))
## Compare row 14  and column  13 with corr  0.907 
##   Means:  0.24 vs 0.13 so flagging column 14 
## Compare row 13  and column  17 with corr  0.844 
##   Means:  0.2 vs 0.122 so flagging column 13 
## Compare row 2  and column  3 with corr  0.975 
##   Means:  0.169 vs 0.114 so flagging column 2 
## Compare row 11  and column  9 with corr  0.709 
##   Means:  0.196 vs 0.107 so flagging column 11 
## Compare row 10  and column  18 with corr  0.999 
##   Means:  0.101 vs 0.1 so flagging column 10 
## All correlations <= 0.7
## [1] "revol_util" "bc_util"    "sub_grade"  "total_acc"  "pub_rec"
# Drop the five flagged features (pairwise correlation > 0.7).
LC.data = LC.data %>% select(-c(pub_rec, revol_util, bc_util, sub_grade, total_acc))

Removed pub_rec, revol_util, bc_util, sub_grade, and total_acc.

Quantitative feature visualization

checking numeric features

Loan_amount

# Loan amount: density and boxplot by loan status, side by side.
Loan_amount_den=ggplot(data=LC.data) + geom_density(mapping= aes(x = loan_amnt,fill=loan_status), alpha=0.7) + theme(legend.position = "top")
loan_amount_box = ggplot(data=LC.data, aes( x=loan_status, y=loan_amnt,)) + geom_boxplot()
figure.4 <- grid.arrange( Loan_amount_den,loan_amount_box,  ncol = 2)

Charged-off customers usually have higher loan amounts.

interest rate

# Interest rate: density and boxplot by loan status.
int_rate_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = int_rate,fill=loan_status), alpha =0.7)
int_rate_box = ggplot(data=LC.data, aes( x=loan_status, y=int_rate,)) + geom_boxplot()
figure.5 <- grid.arrange( int_rate_den,int_rate_box,  ncol = 2)

# Drop the handful of outlier loans with interest rates above 30%
# (mostly among fully-paid customers).
LC.data = LC.data %>% filter(int_rate <=0.3 )

Loans with higher interest rates have a higher chance of being charged off.
There are a few outliers; remove them.

annual income

# Annual income: density and boxplot by loan status.
annual_inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
annual_inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot()
figure.income <- grid.arrange( annual_inc_den,annual_inc_box,  ncol = 2)

summary(LC.data$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4000   55000   80000   98866  120000 9100000
quantile(LC.data$annual_inc,c(0.95, 0.975))
##      95%    97.5% 
## 213759.5 265000.0
# Heavy right tail (incomes up to 9.1M); trim high-income records.
# NOTE(review): the original comment said "remove top 2.5%", but the 97.5%
# quantile is 265000 and the 95% quantile is 213759.5 -- the hard-coded
# cutoff of 189000 matches neither. Confirm the intended threshold.
LC.data = LC.data %>% filter(annual_inc < 189000  )

# Re-plot after removing the income outliers.
inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot() + ggtitle("After Remove Annual Income Outlier")
figure.income_after <- grid.arrange( inc_den,inc_box,  ncol = 2) 

There are a few extreme outliers — some annual incomes are in the millions — so I remove the highest-income records (the code keeps incomes below $189,000, just under the 95% quantile). Higher-income customers may have a slightly higher chance of paying off the loan.

dti

A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.

# DTI: density and boxplot by loan status.
dti_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = dti,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
dti_box = ggplot(data=LC.data, aes( x=loan_status, y=dti,)) + geom_boxplot()
figure.7 <- grid.arrange( dti_den,dti_box,  ncol = 2)

Charged-off customers usually have a higher DTI.

all_util

Balance to credit limit on all trades

# all_util (balance-to-limit across all trades): density and boxplot by
# loan status.
all_util_bar = ggplot(data=LC.data) + geom_density(mapping= aes(x = all_util,fill=loan_status), alpha=0.7)+ theme(legend.position = "top")
all_util_box = ggplot(data=LC.data, aes( x=loan_status, y=all_util,)) + geom_boxplot()
figure.8 <- grid.arrange( all_util_bar,all_util_box,  ncol = 2)

summary(LC.data$all_util)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3900  0.5500  0.5365  0.6900  1.8500
quantile(LC.data$all_util, 0.975)
## 97.5% 
##  0.93
# Outlier filter considered but intentionally left disabled.
#LC.data = LC.data %>% filter (all_util<=1.2)

fico_score

# FICO score distribution by loan status: density + boxplot
fico_density = ggplot(LC.data) +
  geom_density(aes(x = fico_score, fill = loan_status), alpha = 0.5) +
  theme(legend.position = "top")
fico_box = ggplot(LC.data, aes(x = loan_status, y = fico_score)) + geom_boxplot()
figure.fico <- grid.arrange(fico_density, fico_box, ncol = 2)

#### cr_year: number of years since the borrower’s earliest reported credit line was opened

# cr_year: age (in years) of the borrower's earliest credit line
cr_density = ggplot(LC.data) +
  geom_density(aes(x = cr_year, fill = loan_status), alpha = 0.5) +
  theme(legend.position = "top")
cr_box = ggplot(LC.data, aes(x = loan_status, y = cr_year)) + geom_boxplot()
# BUG FIX: this figure was previously assigned to `figure.fico`, clobbering
# the FICO figure created just above; store it under its own name.
figure.cr <- grid.arrange(cr_density, cr_box, ncol = 2)

summary(LC.data$cr_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   12.00   16.00   17.07   20.00   77.00
quantile(LC.data$cr_year, 0.975)
## 97.5% 
##    36
# Drop extreme credit histories (> 50 years). Use the bare column name inside
# filter() rather than LC.data$cr_year, so dplyr evaluates it on the piped data.
LC.data = LC.data %>% filter(cr_year <= 50)

##percent_bc_gt_75

# percent_bc_gt_75: density + boxplot by loan status.
# BUG FIX: position="dodge" is meaningless for geom_density and produced the
# `Width not defined` warning; the densities are simply overlaid instead.
bc_gt_75_density =ggplot(data=LC.data) + geom_density(mapping= aes(x = percent_bc_gt_75,fill=loan_status), alpha=0.7)+ theme(legend.position = "top")
bc_gt_75_box =ggplot(data=LC.data, aes(x= loan_status, y=percent_bc_gt_75)) + geom_boxplot()

figure.bc_gt_75 <- grid.arrange( bc_gt_75_density,bc_gt_75_box,  ncol = 2)

open_acc

# open_acc: density + boxplot by loan status
open_acc.density = ggplot(data=LC.data) + geom_density(mapping= aes(x = open_acc,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
open_acc.box = ggplot(data=LC.data, aes(x = loan_status, y= open_acc)) + geom_boxplot()
figure.open_acc=grid.arrange( open_acc.density,open_acc.box,  ncol = 2)

summary(LC.data$open_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    8.00   11.00   11.78   15.00   67.00
quantile(LC.data$open_acc, .975)
## 97.5% 
##    27
# Keep records at or below the 97.5% quantile (27) of open_acc.
# (The old comment said "top 5%", but 27 is the 97.5% cutoff computed above.)
LC.data = LC.data%>%filter(open_acc<=27)

num_actv_rev_tl

# num_actv_rev_tl: bar chart and boxplot by loan status
num_actv.density = ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_actv_rev_tl,fill=loan_status), position="dodge")+ theme(legend.position = "top")
num_actv.box = ggplot(data=LC.data, aes(x = loan_status, y= num_actv_rev_tl)) + geom_boxplot()
figure.actv=grid.arrange( num_actv.density,num_actv.box,  ncol = 2)

# BUG FIX: the scatter plot referenced a non-existent object `LC`;
# the data frame used throughout this analysis is `LC.data`.
plot(LC.data$open_acc, LC.data$num_actv_rev_tl)

The scatter plot shows open_acc and num_actv_rev_tl are strongly related, so we can remove open_acc.

###delinq_2yrs The number of 30+ days past-due incidences of delinquency in the borrower’s credit file for the past 2 years

# Bar chart of delinquency counts (past 2 years), split by loan status
ggplot(LC.data) +
  geom_bar(aes(x = delinq_2yrs, fill = loan_status), position = "dodge")

# Collapse delinq_2yrs into a binary flag:
# 1 = at least one delinquency, 0 = none
LC.data = LC.data %>% mutate(delinq_2yrs = case_when(delinq_2yrs == 0 ~ 0,
                                                     delinq_2yrs >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = delinq_2yrs, fill = loan_status), position = "fill") +
  ggtitle("deling_2yrs After Grouping")

After grouping, there is no pattern suggesting that the number of delinquencies in the past 2 years would impact the prediction.

##inq_last_6mths The number of inquiries in past 6 months (excluding auto and mortgage inquiries)

#inq_last_6mths
# Raw distribution of inquiry counts by loan status
ggplot(LC.data) +
  geom_bar(aes(x = inq_last_6mths, fill = loan_status), position = "dodge") +
  theme(legend.position = "top")

# Binarize: 1 = one or more inquiries in the last 6 months, 0 = none
LC.data = LC.data %>% mutate(inq_last_6mths = case_when(inq_last_6mths == 0 ~ 0,
                                                        inq_last_6mths >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = inq_last_6mths, fill = loan_status), position = "fill") +
  theme(legend.position = "top")

pub_rec_bankruptcies

Number of public record bankruptcies.

# Histogram of pub_rec_bankruptcies by loan status (before grouping)
ggplot(LC.data) +
  geom_histogram(aes(x = pub_rec_bankruptcies, fill = loan_status),
                 position = "dodge") +
  theme(legend.position = "top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Binarize: 1 = any public-record bankruptcy, 0 = none
LC.data = LC.data %>% mutate(pub_rec_bankruptcies = case_when(pub_rec_bankruptcies == 0 ~ 0,
                                                              pub_rec_bankruptcies >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = pub_rec_bankruptcies, fill = loan_status), position = "fill") +
  ggtitle("After Grouping (pub_rec_bankruptcies)")

pub_rec_bankruptcies is not important.

mort_acc

Number of mortgage accounts.

# mort_acc: number of mortgage accounts.
# BUG FIX: `postiion` was a misspelling of `position`, so ggplot silently
# ignored it (see the "Ignoring unknown parameters" warning); the bars are
# now actually dodged by loan status.
mort_bar=ggplot(data=LC.data) + geom_bar(mapping= aes(x = mort_acc,fill=loan_status), position="dodge")+ theme(legend.position = "top")
mort_box=ggplot(data=LC.data, aes(x= loan_status, y=mort_acc)) + geom_boxplot()
mort_ownship_box =ggplot(data=LC.data, aes(x =home_ownership, y=mort_acc,fill=loan_status)) + geom_boxplot(position ="dodge")
figure.mort_acc <- grid.arrange( mort_bar,mort_box, mort_ownship_box, ncol = 2)

summary(LC.data$mort_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.415   2.000  23.000
quantile(LC.data$mort_acc, 0.95)
## 95% 
##   5
# Outlier removal (mort_acc >= 8) was considered but deliberately left disabled:
#LC.data = LC.data %>% filter(LC.data$mort_acc<8)

##num_accts_ever_120_pd Number of accounts ever 120 or more days past due

# Distribution of num_accts_ever_120_pd by loan status
ggplot(LC.data) +
  geom_bar(aes(x = num_accts_ever_120_pd, fill = loan_status)) +
  theme(legend.position = "top")

# Binarize: 1 = ever had an account 120+ days past due, 0 = never
LC.data = LC.data %>% mutate(num_accts_ever_120_pd = case_when(num_accts_ever_120_pd < 1 ~ 0,
                                                               num_accts_ever_120_pd >= 1 ~ 1))

# Proportion of each loan status within the two groups
ggplot(LC.data) +
  geom_bar(aes(x = num_accts_ever_120_pd, fill = loan_status), position = "fill") +
  theme(legend.position = "top")

num_tl_op_past_12m

Number of accounts opened in past 12 months

# num_tl_op_past_12m: bar chart and boxplot by loan status
tl_open_12m_bar = ggplot(LC.data) +
  geom_bar(aes(x = num_tl_op_past_12m, fill = loan_status), position = "dodge") +
  theme(legend.position = "top")
tl_open_12m_box = ggplot(LC.data, aes(x = loan_status, y = num_tl_op_past_12m)) +
  geom_boxplot()

figure.tl_open <- grid.arrange(tl_open_12m_bar, tl_open_12m_box, ncol = 2)

# Keep borrowers who opened at most 10 accounts in the past 12 months
LC.data = LC.data %>% filter(num_tl_op_past_12m <= 10)

remove unrelated features

# Drop the columns that were either binarized away or judged uninformative above
LC.data = LC.data %>% select(-open_acc, -delinq_2yrs, -inq_last_6mths, -pub_rec_bankruptcies)

check correlation again

# Recompute the correlation matrix over the remaining numeric columns
loanQuanCorr = cor(select_if(LC.data, is.numeric))
corrplot(loanQuanCorr, type = "upper", order = 'hclust', tl.cex = 0.8)

# Confirm no pair of predictors is still correlated above 0.7
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose = T, names = T))
## All correlations <= 0.7
## character(0)