Lending Club is a US peer-to-peer lending company and the world’s largest peer-to-peer lending platform. As explained by Wikipedia.
The goal of this Project is using the data from Lending club (2019Q3 to 2020Q2), conducting as set of explorator analysis and applying multiple machine learning algorithm techniques to predict the customer default rate.
Data files contain complete loan data for all loans issued through the time period stated, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information.
Based on the goal, we will only focus on the records with loan status Fully Paid, Charged off, Default and Late. We will remove the records that not in these status. and group Charged off, Default and Late into one call Default.
setwd("/Users/huitingsheng/Downloads/Github/Lending_Club_EDA_and_Modeling")
data1 = read.csv("LoanStats_securev1_2019Q3.csv")
data2 = read.csv("LoanStats_securev1_2019Q4.csv")
data3 = read.csv("LoanStats_securev1_2020Q1.csv")
data4 = read.csv("LoanStats_securev1_2020Q2.csv")
data = rbind(data1,data2,data3, data4)
dim(data)
## [1] 389627 150
packages=c("caret","ggplot2", "tidyverse", "dplyr", "corrplot","e1071", "reshape2","lubridate","usmap", "glmnet", "pROC","doParallel", "ranger","lattice","gridExtra", "kableExtra", "ROSE", "DMwR")
# Now load or install & load all
package.check <- lapply(packages,FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
}
)
Our goal is the predict the default rate of customers,so before we move forward to clean up the data. Let’s combine loan status. We only want records that either fully paid, or default, charged off,or late. Default, Charged off and Late are consider as Risky Customer that we want to prevent.
As we don’t know whether the current customer and issued customer will be fully paid or turn into charged off. so we remove all the current and issued records.and combine Default, Charged off and Late into a group called Charged_off
df = data
as.data.frame(table(df$loan_status))
## Var1 Freq
## 1 16
## 2 Charged Off 6387
## 3 Current 333956
## 4 Default 67
## 5 Fully Paid 40207
## 6 In Grace Period 4070
## 7 Late (16-30 days) 1384
## 8 Late (31-120 days) 3531
## 9 Issued 9
ggplot(data=df,aes(y=loan_status)) + geom_bar()
# remove all current records
df = df %>%
filter(loan_status !="Current" & loan_status !="Issued" & loan_status !="") %>%
mutate(loan_status = if_else(loan_status %in%
c("Fully Paid"," Does not meet the credit policy. Status:Fully Paid "),
"Fully_Paid","Charged_off"))
as.data.frame(table(df$loan_status))
## Var1 Freq
## 1 Charged_off 15439
## 2 Fully_Paid 40207
Charged_off and Fully_paid is unbalanced. Will use down sampling while build models.
There are two types of application type, individual and joint. But there are only 10% application is joint application. I want to transform related features before formal data cleansing, in case all join application info get drop off.
df = df %>%
mutate(annual_inc = ifelse(application_type == "Joint App",annual_inc_joint,annual_inc),
dti = ifelse(application_type == "Joint App", dti_joint,dti),
verification_status = ifelse(application_type == "Joint App",
verification_status_joint,verification_status)) %>%
select(-c(annual_inc_joint,dti_joint,verification_status_joint))
No Variance, and low variance features don’t bring any useful info to us model. so we are going to drop them.
nzv = colnames(df)[nearZeroVar(df, allowParallel = TRUE)]
cat("Zero or low Variance Columns are: \n", nzv)
## Zero or low Variance Columns are:
## member_id pymnt_plan desc total_rec_late_fee recoveries collection_recovery_fee collections_12_mths_ex_med policy_code acc_now_delinq tot_coll_amt chargeoff_within_12_mths delinq_amnt num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m pct_tl_nvr_dlq tax_liens sec_app_earliest_cr_line sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med hardship_amount hardship_end_date payment_plan_start_date hardship_dpd debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date
data_wo_nzv = df %>% select(-all_of(nzv)) # drop no and low variance features
We have dropped 28 features.
A feature missing more than 50%, then we want to drop them, as it does not bring extra info to our model.
# checking fraction of missing value for each feature
fraction=c()
per = c()
# check every columns that missing data is above 50%
for(i in 1:dim(data_wo_nzv)[2]){
temp = sum(is.na(data_wo_nzv[,i]))
perc = temp/dim(data_wo_nzv)[1]
per = append(per, perc)
fraction=rbind(fraction,c(colnames(data_wo_nzv)[i], temp, perc))
}
fraction = as.data.frame(fraction)
colnames(fraction)=c("name","count","missing_fraction")
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name | count | missing_fraction |
---|---|---|
id | 0 | 0 |
loan_amnt | 0 | 0 |
funded_amnt | 0 | 0 |
funded_amnt_inv | 0 | 0 |
term | 0 | 0 |
int_rate | 0 | 0 |
installment | 0 | 0 |
grade | 0 | 0 |
sub_grade | 0 | 0 |
emp_title | 1 | 1.79707436293714e-05 |
emp_length | 0 | 0 |
home_ownership | 0 | 0 |
annual_inc | 0 | 0 |
verification_status | 0 | 0 |
issue_d | 0 | 0 |
loan_status | 0 | 0 |
url | 0 | 0 |
purpose | 0 | 0 |
title | 0 | 0 |
zip_code | 0 | 0 |
addr_state | 0 | 0 |
dti | 0 | 0 |
delinq_2yrs | 0 | 0 |
earliest_cr_line | 0 | 0 |
fico_range_low | 0 | 0 |
fico_range_high | 0 | 0 |
inq_last_6mths | 0 | 0 |
mths_since_last_delinq | 31076 | 0.558458829026345 |
mths_since_last_record | 49227 | 0.884645796643065 |
open_acc | 0 | 0 |
pub_rec | 0 | 0 |
revol_bal | 0 | 0 |
revol_util | 0 | 0 |
total_acc | 0 | 0 |
initial_list_status | 0 | 0 |
out_prncp | 0 | 0 |
out_prncp_inv | 0 | 0 |
total_pymnt | 0 | 0 |
total_pymnt_inv | 0 | 0 |
total_rec_prncp | 0 | 0 |
total_rec_int | 0 | 0 |
last_pymnt_d | 0 | 0 |
last_pymnt_amnt | 0 | 0 |
next_pymnt_d | 0 | 0 |
last_credit_pull_d | 0 | 0 |
last_fico_range_high | 0 | 0 |
last_fico_range_low | 0 | 0 |
mths_since_last_major_derog | 43194 | 0.776228300327068 |
application_type | 0 | 0 |
tot_cur_bal | 0 | 0 |
open_acc_6m | 0 | 0 |
open_act_il | 0 | 0 |
open_il_12m | 0 | 0 |
open_il_24m | 0 | 0 |
mths_since_rcnt_il | 1279 | 0.022984581101966 |
total_bal_il | 0 | 0 |
il_util | 7733 | 0.138967760485929 |
open_rv_12m | 0 | 0 |
open_rv_24m | 0 | 0 |
max_bal_bc | 0 | 0 |
all_util | 20 | 0.000359414872587428 |
total_rev_hi_lim | 0 | 0 |
inq_fi | 0 | 0 |
total_cu_tl | 0 | 0 |
inq_last_12m | 0 | 0 |
acc_open_past_24mths | 0 | 0 |
avg_cur_bal | 10 | 0.000179707436293714 |
bc_open_to_buy | 872 | 0.0156704884448118 |
bc_util | 908 | 0.0163174352154692 |
mo_sin_old_il_acct | 1279 | 0.022984581101966 |
mo_sin_old_rev_tl_op | 0 | 0 |
mo_sin_rcnt_rev_tl_op | 0 | 0 |
mo_sin_rcnt_tl | 0 | 0 |
mort_acc | 0 | 0 |
mths_since_recent_bc | 814 | 0.0146281853143083 |
mths_since_recent_bc_dlq | 44081 | 0.79216834992632 |
mths_since_recent_inq | 4438 | 0.0797541602271502 |
mths_since_recent_revol_delinq | 39139 | 0.703356934909967 |
num_accts_ever_120_pd | 0 | 0 |
num_actv_bc_tl | 0 | 0 |
num_actv_rev_tl | 0 | 0 |
num_bc_sats | 0 | 0 |
num_bc_tl | 0 | 0 |
num_il_tl | 0 | 0 |
num_op_rev_tl | 0 | 0 |
num_rev_accts | 0 | 0 |
num_rev_tl_bal_gt_0 | 0 | 0 |
num_sats | 0 | 0 |
num_tl_op_past_12m | 0 | 0 |
percent_bc_gt_75 | 874 | 0.0157064299320706 |
pub_rec_bankruptcies | 0 | 0 |
tot_hi_cred_lim | 0 | 0 |
total_bal_ex_mort | 0 | 0 |
total_bc_limit | 0 | 0 |
total_il_high_credit_limit | 0 | 0 |
revol_bal_joint | 48503 | 0.8716349782554 |
sec_app_fico_range_low | 48503 | 0.8716349782554 |
sec_app_fico_range_high | 48503 | 0.8716349782554 |
sec_app_inq_last_6mths | 48503 | 0.8716349782554 |
sec_app_mort_acc | 48503 | 0.8716349782554 |
sec_app_open_acc | 48503 | 0.8716349782554 |
sec_app_revol_util | 48646 | 0.8742047945944 |
sec_app_open_act_il | 48503 | 0.8716349782554 |
sec_app_num_rev_accts | 48503 | 0.8716349782554 |
sec_app_mths_since_last_major_derog | 53403 | 0.95969162203932 |
hardship_flag | 0 | 0 |
hardship_type | 0 | 0 |
hardship_reason | 0 | 0 |
hardship_status | 0 | 0 |
deferral_term | 49429 | 0.888275886856198 |
hardship_start_date | 0 | 0 |
hardship_length | 49429 | 0.888275886856198 |
hardship_loan_status | 0 | 0 |
orig_projected_additional_accrued_interest | 42743 | 0.768123494950221 |
hardship_payoff_balance_amount | 42743 | 0.768123494950221 |
hardship_last_payment_amount | 42743 | 0.768123494950221 |
settlement_amount | 55576 | 0.998742047945944 |
settlement_percentage | 55576 | 0.998742047945944 |
settlement_term | 55576 | 0.998742047945944 |
# Histogram of Feature Incompleteness
hist(per, main="Histogram of Feature Incompleteness", xlab="missing fraction")
# drop columns that missing more than 50% record
data_drop = data_wo_nzv %>% select (-colnames(data_wo_nzv)[per>0.5])
cat("drop columns are:\n", colnames(data_wo_nzv)[per>0.5])
## drop columns are:
## mths_since_last_delinq mths_since_last_record mths_since_last_major_derog mths_since_recent_bc_dlq mths_since_recent_revol_delinq revol_bal_joint sec_app_fico_range_low sec_app_fico_range_high sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_mths_since_last_major_derog deferral_term hardship_length orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount settlement_amount settlement_percentage settlement_term
We have dropped another23 features.
We want to remove noise features, like url, desc, etc. Also, We want to focus on the information that customers provide to Lending club when they apply the loan to predict the default. We will ignore all features that generated after the loan is approved.
# check current data before futhur feature selection
data_drop[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | url | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | last_fico_range_high | last_fico_range_low | application_type | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | bc_open_to_buy | bc_util | mo_sin_old_il_acct | mo_sin_old_rev_tl_op | mo_sin_rcnt_rev_tl_op | mo_sin_rcnt_tl | mort_acc | mths_since_recent_bc | mths_since_recent_inq | num_accts_ever_120_pd | num_actv_bc_tl | num_actv_rev_tl | num_bc_sats | num_bc_tl | num_il_tl | num_op_rev_tl | num_rev_accts | num_rev_tl_bal_gt_0 | num_sats | num_tl_op_past_12m | percent_bc_gt_75 | pub_rec_bankruptcies | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | hardship_flag | hardship_type | hardship_reason | hardship_status | hardship_start_date | hardship_loan_status |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
158332903 | 8000 | 8000 | 8000 | 36 months | 10.33% | 259.38 | B | B1 | Processing Manager | 6 years | MORTGAGE | 116600 | 3 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=158332903 | debt_consolidation | Debt consolidation | 982xx | WA | 36.48 | 0 | Mar-05 | 700 | 704 | 1 | 19 | 0 | 67350 | 55% | 34 | w | 0.00 | 0.00 | 8144.882 | 8144.88 | 8000.00 | 144.88 | Dec-19 | 7890.09 | Aug-20 | 704 | 700 | Individual | 137633 | 1 | 4 | 3 | 4 | 2 | 70283 | 74 | 0 | 1 | 19425 | 64 | 122400 | 3 | 6 | 3 | 5 | 7244 | 31138 | 47.5 | 158 | 174 | 13 | 2 | 0 | 13 | 2 | 0 | 4 | 8 | 7 | 10 | 11 | 15 | 23 | 8 | 19 | 3 | 42.9 | 0 | 216889 | 137633 | 59300 | 94489 | N | ||||||
159383484 | 25000 | 25000 | 25000 | 36 months | 28.80% | 1044.93 | D | D5 | traffic cordinator | 10+ years | MORTGAGE | 80000 | 3 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=159383484 | debt_consolidation | Debt consolidation | 351xx | AL | 18.15 | 0 | Sep-08 | 695 | 699 | 0 | 6 | 0 | 43105 | 85.90% | 30 | w | 0.00 | 0.00 | 25320.000 | 25320.00 | 25000.00 | 320.00 | Oct-19 | 25360.00 | Jan-20 | 744 | 740 | Individual | 131078 | 1 | 1 | 0 | 1 | 21 | 26318 | 76 | 1 | 2 | 2988 | 82 | 50200 | 1 | 12 | 0 | 3 | 21846 | 7212 | 29.3 | 120 | 132 | 3 | 3 | 3 | 17 | 21 | 0 | 1 | 3 | 2 | 5 | 17 | 4 | 10 | 3 | 6 | 1 | 50.0 | 0 | 157774 | 69423 | 10200 | 34574 | N | ||||||
159523765 | 15950 | 15950 | 15950 | 36 months | 28.80% | 666.67 | D | D5 | < 1 year | RENT | 125000 | 3 | Sep-19 | Charged_off | https://lendingclub.com/browse/loanDetail.action?loan_id=159523765 | debt_consolidation | Debt consolidation | 331xx | FL | 20.75 | 0 | May-06 | 680 | 684 | 0 | 11 | 0 | 32528 | 47.30% | 45 | w | 0.00 | 0.00 | 1891.150 | 1891.15 | 0.00 | 0.00 | 0.00 | Apr-20 | 544 | 540 | Individual | 50854 | 1 | 1 | 0 | 2 | 13 | 18326 | 64 | 5 | 5 | 14173 | 52 | 68800 | 2 | 1 | 4 | 8 | 4623 | 29864 | 51.7 | 91 | 160 | 4 | 4 | 1 | 4 | 7 | 0 | 6 | 7 | 9 | 27 | 9 | 10 | 33 | 7 | 11 | 6 | 33.3 | 0 | 97398 | 50854 | 61800 | 28598 | N | ||||||||
159568503 | 9900 | 9900 | 9900 | 36 months | 8.19% | 311.10 | A | A4 | Teacher | 9 years | MORTGAGE | 333000 | 1 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=159568503 | credit_card | Credit card refinancing | 212xx | MD | 6.38 | 0 | Sep-04 | 715 | 719 | 0 | 11 | 0 | 18633 | 54% | 30 | w | 0.00 | 0.00 | 10203.490 | 10203.49 | 9900.00 | 303.49 | Feb-20 | 8963.59 | Mar-20 | 674 | 670 | Joint App | 503648 | 1 | 3 | 1 | 2 | 11 | 84319 | 74 | 2 | 3 | 5847 | 57 | 34500 | 0 | 3 | 0 | 5 | 45786 | 8967 | 65.8 | 168 | 180 | 5 | 5 | 1 | 5 | 16 | 4 | 5 | 6 | 5 | 6 | 16 | 7 | 13 | 6 | 11 | 3 | 60.0 | 0 | 539572 | 102952 | 26200 | 87770 | N | ||||||
159216296 | 16000 | 16000 | 16000 | 60 months | 8.19% | 325.88 | A | A4 | Business Agent | 2 years | MORTGAGE | 116000 | 2 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=159216296 | credit_card | Credit card refinancing | 554xx | MN | 15.69 | 0 | Feb-04 | 720 | 724 | 0 | 25 | 0 | 44990 | 20.70% | 68 | w | 0.00 | 0.00 | 16417.115 | 16417.12 | 16000.00 | 417.12 | Dec-19 | 15772.64 | Jan-20 | 764 | 760 | Individual | 53953 | 1 | 2 | 0 | 0 | 31 | 8963 | 30 | 2 | 5 | 10900 | 22 | 217000 | 0 | 4 | 2 | 6 | 2158 | 171110 | 19.3 | 140 | 187 | 9 | 2 | 4 | 9 | 3 | 0 | 13 | 14 | 22 | 32 | 20 | 23 | 44 | 14 | 25 | 3 | 0.0 | 0 | 246763 | 53953 | 212000 | 29763 | N | ||||||
159602788 | 15950 | 15950 | 15950 | 36 months | 20.55% | 597.24 | D | D2 | Project Manager | 8 years | MORTGAGE | 105000 | 3 | Sep-19 | Charged_off | https://lendingclub.com/browse/loanDetail.action?loan_id=159602788 | credit_card | Credit card refinancing | 776xx | TX | 27.61 | 0 | Sep-06 | 710 | 714 | 0 | 10 | 0 | 11978 | 22.80% | 38 | w | 0.00 | 0.00 | 2967.990 | 2967.99 | 1685.29 | 1282.70 | Feb-20 | 597.24 | Aug-20 | 554 | 550 | Individual | 288166 | 0 | 3 | 0 | 3 | 18 | 114203 | NA | 1 | 3 | 9489 | 23 | 52500 | 0 | 0 | 5 | 6 | 28817 | 40522 | 22.8 | 156 | 78 | 9 | 9 | 2 | 9 | 1 | 0 | 5 | 5 | 6 | 7 | 25 | 6 | 11 | 5 | 10 | 1 | 33.3 | 0 | 362222 | 126181 | 52500 | 138559 | N | ||||||
159026517 | 4200 | 4200 | 4200 | 36 months | 20.55% | 157.27 | D | D2 | Rural Carrier associate | 3 years | MORTGAGE | 44500 | 4 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=159026517 | debt_consolidation | Debt consolidation | 760xx | TX | 35.30 | 0 | Jun-03 | 670 | 674 | 1 | 16 | 0 | 15665 | 56.10% | 22 | w | 0.00 | 0.00 | 4670.050 | 4670.05 | 4200.00 | 470.05 | May-20 | 3731.23 | May-20 | 674 | 670 | Individual | 133206 | 2 | 4 | 2 | 3 | 10 | 17370 | 58 | 3 | 4 | 4555 | 57 | 27900 | 2 | 0 | 4 | 7 | 8325 | 3483 | 79.6 | 82 | 81 | 3 | 3 | 1 | 3 | 2 | 0 | 6 | 10 | 6 | 8 | 6 | 11 | 15 | 10 | 16 | 5 | 66.7 | 0 | 197254 | 33035 | 17100 | 29979 | |||||||
158884774 | 27200 | 27200 | 27200 | 60 months | 15.24% | 650.52 | C | C2 | Dentist | 1 year | RENT | 150000 | 3 | Sep-19 | Charged_off | https://lendingclub.com/browse/loanDetail.action?loan_id=158884774 | credit_card | Credit card refinancing | 029xx | RI | 23.66 | 0 | Feb-05 | 695 | 699 | 0 | 12 | 0 | 31561 | 51.70% | 27 | w | 0.00 | 0.00 | 2611.580 | 2611.58 | 1238.32 | 1340.73 | Mar-20 | 0.00 | Aug-20 | 564 | 560 | Individual | 863114 | 3 | 7 | 2 | 2 | 3 | 831553 | 92 | 1 | 1 | 29624 | 68 | 61000 | 0 | 0 | 0 | 3 | 86311 | 29439 | 51.7 | 175 | 159 | 4 | 3 | 0 | 4 | NA | 0 | 2 | 2 | 3 | 4 | 23 | 3 | 4 | 2 | 12 | 3 | 66.7 | 0 | 724574 | 863114 | 61000 | 663574 | N | ||||||
159598770 | 5250 | 5250 | 5250 | 36 months | 7.56% | 163.46 | A | A3 | Product Manager | 6 years | RENT | 96000 | 2 | Sep-19 | Fully_Paid | https://lendingclub.com/browse/loanDetail.action?loan_id=159598770 | debt_consolidation | Debt consolidation | 275xx | NC | 2.09 | 0 | Apr-10 | 720 | 724 | 0 | 6 | 0 | 3012 | 5.70% | 16 | w | 0.00 | 0.00 | 5376.050 | 5376.05 | 5250.00 | 126.05 | Feb-20 | 3887.87 | Aug-20 | 724 | 720 | Individual | 139796 | 0 | 0 | 1 | 1 | 8 | 0 | NA | 3 | 3 | 1461 | 6 | 52500 | 1 | 0 | 2 | 4 | 23299 | 36039 | 3.9 | 56 | 113 | 11 | 8 | 1 | 12 | 8 | 0 | 1 | 2 | 2 | 6 | 2 | 5 | 13 | 2 | 6 | 4 | 0.0 | 0 | 214511 | 3012 | 37500 | 0 | N | ||||||
158744038 | 26000 | 26000 | 26000 | 36 months | 15.24% | 904.36 | C | C2 | Security guard | < 1 year | RENT | 89000 | 3 | Sep-19 | Charged_off | https://lendingclub.com/browse/loanDetail.action?loan_id=158744038 | debt_consolidation | Debt consolidation | 114xx | NY | 7.42 | 0 | Sep-08 | 750 | 754 | 1 | 7 | 0 | 0 | 0% | 15 | w | 22443.41 | 22443.41 | 5404.150 | 5404.15 | 3556.59 | 1847.56 | May-20 | 0.00 | Sep-20 | Aug-20 | 639 | 635 | Joint App | 22251 | 1 | 2 | 2 | 3 | 5 | 22251 | 92 | 1 | 2 | 0 | 51 | 19200 | 2 | 0 | 2 | 5 | 3179 | 18000 | 0.0 | 16 | 132 | 11 | 5 | 0 | 24 | 5 | 0 | 0 | 0 | 4 | 8 | 3 | 5 | 12 | 0 | 7 | 3 | 0.0 | 0 | 43300 | 22251 | 18000 | 24100 | N | CVD19SKIP | INCOMECURT | COMPLETE | Apr-20 | ACTIVE |
columns_keep = c('loan_amnt','term','sub_grade',"int_rate",'emp_length','home_ownership','annual_inc','verification_status','purpose','addr_state','dti','delinq_2yrs','earliest_cr_line','fico_range_low','fico_range_high', "inq_last_6mths", 'open_acc','pub_rec','total_acc',"initial_list_status","application_type","all_util","bc_util",'revol_util','mort_acc', 'num_accts_ever_120_pd', "percent_bc_gt_75", 'pub_rec_bankruptcies',"num_tl_op_past_12m","num_actv_rev_tl",'loan_status')
LC = data_drop %>%select(all_of(columns_keep))
col_drops = data_drop %>%select(-all_of(columns_keep))%>%colnames
cat("columns that droped: \n", col_drops)
## columns that droped:
## id funded_amnt funded_amnt_inv installment grade emp_title issue_d url title zip_code revol_bal out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mths_since_recent_bc mths_since_recent_inq num_actv_bc_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit hardship_flag hardship_type hardship_reason hardship_status hardship_start_date hardship_loan_status
# remaining features
LC[1:10,] %>% kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
loan_amnt | term | sub_grade | int_rate | emp_length | home_ownership | annual_inc | verification_status | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | open_acc | pub_rec | total_acc | initial_list_status | application_type | all_util | bc_util | revol_util | mort_acc | num_accts_ever_120_pd | percent_bc_gt_75 | pub_rec_bankruptcies | num_tl_op_past_12m | num_actv_rev_tl | loan_status |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8000 | 36 months | B1 | 10.33% | 6 years | MORTGAGE | 116600 | 3 | debt_consolidation | WA | 36.48 | 0 | Mar-05 | 700 | 704 | 1 | 19 | 0 | 34 | w | Individual | 64 | 47.5 | 55% | 0 | 0 | 42.9 | 0 | 3 | 8 | Fully_Paid |
25000 | 36 months | D5 | 28.80% | 10+ years | MORTGAGE | 80000 | 3 | debt_consolidation | AL | 18.15 | 0 | Sep-08 | 695 | 699 | 0 | 6 | 0 | 30 | w | Individual | 82 | 29.3 | 85.90% | 3 | 0 | 50.0 | 0 | 1 | 3 | Fully_Paid |
15950 | 36 months | D5 | 28.80% | < 1 year | RENT | 125000 | 3 | debt_consolidation | FL | 20.75 | 0 | May-06 | 680 | 684 | 0 | 11 | 0 | 45 | w | Individual | 52 | 51.7 | 47.30% | 1 | 0 | 33.3 | 0 | 6 | 7 | Charged_off |
9900 | 36 months | A4 | 8.19% | 9 years | MORTGAGE | 333000 | 1 | credit_card | MD | 6.38 | 0 | Sep-04 | 715 | 719 | 0 | 11 | 0 | 30 | w | Joint App | 57 | 65.8 | 54% | 1 | 4 | 60.0 | 0 | 3 | 6 | Fully_Paid |
16000 | 60 months | A4 | 8.19% | 2 years | MORTGAGE | 116000 | 2 | credit_card | MN | 15.69 | 0 | Feb-04 | 720 | 724 | 0 | 25 | 0 | 68 | w | Individual | 22 | 19.3 | 20.70% | 4 | 0 | 0.0 | 0 | 3 | 14 | Fully_Paid |
15950 | 36 months | D2 | 20.55% | 8 years | MORTGAGE | 105000 | 3 | credit_card | TX | 27.61 | 0 | Sep-06 | 710 | 714 | 0 | 10 | 0 | 38 | w | Individual | 23 | 22.8 | 22.80% | 2 | 0 | 33.3 | 0 | 1 | 5 | Charged_off |
4200 | 36 months | D2 | 20.55% | 3 years | MORTGAGE | 44500 | 4 | debt_consolidation | TX | 35.30 | 0 | Jun-03 | 670 | 674 | 1 | 16 | 0 | 22 | w | Individual | 57 | 79.6 | 56.10% | 1 | 0 | 66.7 | 0 | 5 | 10 | Fully_Paid |
27200 | 60 months | C2 | 15.24% | 1 year | RENT | 150000 | 3 | credit_card | RI | 23.66 | 0 | Feb-05 | 695 | 699 | 0 | 12 | 0 | 27 | w | Individual | 68 | 51.7 | 51.70% | 0 | 0 | 66.7 | 0 | 3 | 2 | Charged_off |
5250 | 36 months | A3 | 7.56% | 6 years | RENT | 96000 | 2 | debt_consolidation | NC | 2.09 | 0 | Apr-10 | 720 | 724 | 0 | 6 | 0 | 16 | w | Individual | 6 | 3.9 | 5.70% | 1 | 0 | 0.0 | 0 | 4 | 2 | Fully_Paid |
26000 | 36 months | C2 | 15.24% | < 1 year | RENT | 89000 | 3 | debt_consolidation | NY | 7.42 | 0 | Sep-08 | 750 | 754 | 1 | 7 | 0 | 15 | w | Joint App | 51 | 0.0 | 0% | 0 | 0 | 0.0 | 0 | 3 | 0 | Charged_off |
# check missing data again
fraction=c()
for(i in 1:dim(LC)[2]){
temp = sum(is.na(LC[,i]))
perc = temp/dim(LC)[1]
fraction=rbind(fraction,c(colnames(LC)[i], temp, perc))
}
fraction = as.data.frame(fraction)
colnames(fraction)=c("name","count","missing_fraction")
fraction%>%kbl() %>% kable_styling( font_size = 12) %>%scroll_box(width = "100%", height="300px", fixed_thead=T )
name | count | missing_fraction |
---|---|---|
loan_amnt | 0 | 0 |
term | 0 | 0 |
sub_grade | 0 | 0 |
int_rate | 0 | 0 |
emp_length | 0 | 0 |
home_ownership | 0 | 0 |
annual_inc | 0 | 0 |
verification_status | 0 | 0 |
purpose | 0 | 0 |
addr_state | 0 | 0 |
dti | 0 | 0 |
delinq_2yrs | 0 | 0 |
earliest_cr_line | 0 | 0 |
fico_range_low | 0 | 0 |
fico_range_high | 0 | 0 |
inq_last_6mths | 0 | 0 |
open_acc | 0 | 0 |
pub_rec | 0 | 0 |
total_acc | 0 | 0 |
initial_list_status | 0 | 0 |
application_type | 0 | 0 |
all_util | 20 | 0.000359414872587428 |
bc_util | 908 | 0.0163174352154692 |
revol_util | 0 | 0 |
mort_acc | 0 | 0 |
num_accts_ever_120_pd | 0 | 0 |
percent_bc_gt_75 | 874 | 0.0157064299320706 |
pub_rec_bankruptcies | 0 | 0 |
num_tl_op_past_12m | 0 | 0 |
num_actv_rev_tl | 0 | 0 |
loan_status | 0 | 0 |
# As we have 55671 rows, we can just drop all records with missing value
LC = LC%>%drop_na
as.data.frame(table(LC$loan_status))
## Var1 Freq
## 1 Charged_off 15178
## 2 Fully_Paid 39558
earliest_cr_line is the month the borrower’s earliest reported credit line was opened. we want to convert it to a numeric value that show how many years that the borrower own credit line since the first one is opened.
# convert the earliest_cr_line to credit history in year
convert_year <- function(x, year=1940){
m <- year(x) %% 100
year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m)
x
}
LC = LC%>%mutate(cr_year = earliest_cr_line %>%
paste0("01-",.) %>%
as.Date(format="%d-%B-%y") %>%
convert_year () %>%
difftime(Sys.Date(), .) %>%
time_length("years") %>%
round(0)) %>% select (-earliest_cr_line)
take the average of fico score high and low.
# average fico score high and low and assign it to fico_score
LC = LC %>% mutate(fico_score = 0.5*fico_range_low +0.5*fico_range_high) %>%
select(-c(fico_range_low,fico_range_high))
correct the data type
table(sapply(LC[1,], class))
##
## character factor integer numeric
## 1 10 13 6
cat("Character columns: ", LC %>% select_if(is.character) %>% colnames, "\n")
## Character columns: loan_status
# Convert loan_status to Factor
LC$loan_status = as.factor(LC$loan_status)
cat("Factor columns: ", LC %>% select_if(is.factor )%>% colnames, "\n")
## Factor columns: term sub_grade int_rate emp_length home_ownership purpose addr_state initial_list_status application_type revol_util loan_status
# Convert int_rate should to Numeric
LC$int_rate = as.numeric(sub("%","",LC$int_rate))/100
# Convert revol_util should to Numeric
LC$revol_util = as.numeric(sub("%","",LC$revol_util))/100
# substract year from emp_length
LC$emp_length = sub("years","",LC$emp_length)
LC$emp_length = as.factor(sub("year","",LC$emp_length))
cat("Numeric columns: ", LC %>% select_if(is.numeric) %>% colnames,"\n")
## Numeric columns: loan_amnt int_rate annual_inc verification_status dti delinq_2yrs inq_last_6mths open_acc pub_rec total_acc all_util bc_util revol_util mort_acc num_accts_ever_120_pd percent_bc_gt_75 pub_rec_bankruptcies num_tl_op_past_12m num_actv_rev_tl cr_year fico_score
# Convert Verification status to Factor
LC$verification_status = as.factor(LC$verification_status)
# Convert dti, all_util, bc_util, percent_bc_gt_75 into decimal
LC$dti = LC$dti/100
LC$all_util = LC$all_util/100
LC$bc_util = LC$bc_util/100
LC$percent_bc_gt_75 = LC$percent_bc_gt_75/100
Checking the data type again after converting.
# Check the class type after convert
table(sapply(LC[1,], class))
##
## factor integer numeric
## 10 11 9
# check any n/a value
anyNA(LC)
## [1] FALSE
# term
term_bar = ggplot(data=LC, aes(x = term, fill= loan_status))+geom_bar( position = 'fill') #+ theme(legend.position = "top")
# sub_grade
Subgrade_bar = ggplot(data=LC, aes(x = sub_grade, fill= loan_status))+geom_bar(position = "fill")# + theme(legend.position = "top")
# emp_length
emp_length_bar = ggplot(data=LC, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")
# home ownership
home_ownership_bar = ggplot(data=LC, aes(y = home_ownership, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")
# verifcation_status
verification_status_bar = ggplot(data=LC, aes(x = verification_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")
#purpose
purpose_bar = ggplot(data=LC, aes(y = purpose, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")
# initial_list_status
initial_list_status_bar = ggplot(data=LC, aes(x = initial_list_status, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")
# application_type
application_type_bar = ggplot(data=LC, aes(x = application_type, fill= loan_status))+geom_bar(position = "fill") #+ theme(legend.position = "top")
figure.1 <- grid.arrange(term_bar, verification_status_bar,home_ownership_bar,initial_list_status_bar,application_type_bar,ncol =2)
figure.2 <- grid.arrange(Subgrade_bar, ncol = 1)
figure.3 <- grid.arrange( emp_length_bar,purpose_bar, ncol = 1)
Charge off Rate by state.
# bar plot based on addr_state
ggplot(data = LC, aes(y=addr_state, fill=loan_status)) + geom_bar(position="fill")
# Group the state and calculate the percentage of the default loan in each state
summary <- LC %>%
group_by(addr_state) %>%
summarise(percentage = (sum(loan_status == "Charged_off")/(sum(loan_status == "Charged_off")+sum(loan_status == "Fully_Paid"))))
summary = summary[c(2,1,3:50),]
library(usmap)
summary$full = summary$addr_state
summary$full = as.factor(summary$full)
levels(summary$full) = c("","Alaska","Alabama","Arkansas", "Arizona", "California","Colorado","Connecticut","District of Columbia","Delaware","Florida","Georgia","Hawaii","Iowa","Idaho","Illinois","Indiana","Kansas","Kentucky","Louisiana","Massachusetts","Maryland","Maine","Michigan","Minnesota","Missouri","Mississippi","Montana","North Carolina","Nebraska","New Hampshire","New Jersey","New Mexico","Nevada","New York","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah","Virginia","Vermont","Washington","Wisconsin","West Virginia","Wyoming","")
us_states = usmap::us_map("state")
loanstatusperstate = merge(us_states, summary, by="full")[,c(8,11)]
plot_usmap(data=loanstatusperstate, labels = TRUE, values="percentage") + theme(legend.position = "right") + scale_fill_continuous(name = "Default Loan Percentage") + labs(title = "Default Loan Percentage by States")
check the relationship between purpose and loan_amnt
# check the relationship between purpose and loan_amnt
purpose_box=ggplot(data=LC, aes( y=purpose, x=loan_amnt,fill=loan_status)) + geom_boxplot()
figure.purpose <- grid.arrange( purpose_bar,purpose_box, ncol = 1)
checking the correlation between_grade and int_rate
# checking the correlation between_grade and int_rate
ggplot(data=LC, aes( y=sub_grade, x=int_rate,)) + geom_boxplot()
# remove the outliers at D1 and D2 with interest rate lower than 0.1
LC = LC[!(LC$sub_grade%in% c("D1","D2") & LC$int_rate < 0.1),]
table(LC$emp_length)
##
## < 1 1 10+ 2 3 4 5 6 7 8 9 n/a
## 6709 3669 16906 4669 4152 3206 3582 2448 1876 1701 1362 4451
initial_list_status have low relationship to loan status, we can drop this features.
grade has linear relation to the loan status, I will convert it to numeric 1-20, A1 is the highest 20, D5 is lowest 1.
I also convert the employment length into numeric
# drop initial_list_status
LC = LC %>% select(-c(initial_list_status))
LC.data = LC %>% mutate(sub_grade = case_when(sub_grade == "A1" ~ 20,
sub_grade == "A2" ~ 19,
sub_grade == "A3" ~ 18,
sub_grade == "A4" ~ 17,
sub_grade == "A5" ~ 16,
sub_grade == "B1" ~ 15,
sub_grade == "B2" ~ 14,
sub_grade == "B3" ~ 13,
sub_grade == "B4" ~ 12,
sub_grade == "B5" ~ 11,
sub_grade == "C1" ~ 10,
sub_grade == "C2" ~ 9,
sub_grade == "C3" ~ 8,
sub_grade == "C4" ~ 7,
sub_grade == "C5" ~ 6,
sub_grade == "D1" ~ 5,
sub_grade == "D2" ~ 4,
sub_grade == "D3" ~ 3,
sub_grade == "D4" ~ 2,
sub_grade == "D5" ~ 1,
),
#purpose = as.factor(case_when(purpose %in% c("credit_card", "debt_consolidation", "small_business") ~ "Finance",
# purpose %in% c("home_improvement", "house") ~ "house_related",
# purpose %in% c("renewable_energy", "medical","moving") ~ "house_related",
# purpose %in% c("car", "vacation","other", "major_purchase") ~ "other"
# )),
emp_length = as.numeric(case_when(emp_length == "10+ " ~ "10",
emp_length == "< 1 " ~ "0.5",
emp_length == "n/a" ~ "0",
TRUE ~ as.character(emp_length))))
emp_length_bar = ggplot(data=LC.data, aes(y = emp_length, fill= loan_status))+geom_bar(position = "fill") + theme(legend.position = "top")
anyNA(LC.data)
## [1] FALSE
# check correlation between all quantitive futures
loanQuanCorr = cor(LC.data %>% select_if(is.numeric))
corrplot(loanQuanCorr, order ='hclust',type="upper", tl.cex = 0.8)
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose=T, names =T))
## Compare row 14 and column 13 with corr 0.907
## Means: 0.24 vs 0.13 so flagging column 14
## Compare row 13 and column 17 with corr 0.844
## Means: 0.2 vs 0.122 so flagging column 13
## Compare row 2 and column 3 with corr 0.975
## Means: 0.169 vs 0.114 so flagging column 2
## Compare row 11 and column 9 with corr 0.709
## Means: 0.196 vs 0.107 so flagging column 11
## Compare row 10 and column 18 with corr 0.999
## Means: 0.101 vs 0.1 so flagging column 10
## All correlations <= 0.7
## [1] "revol_util" "bc_util" "sub_grade" "total_acc" "pub_rec"
# remove features that correlation greater than 0.7
LC.data = LC.data %>% select(-c(pub_rec, revol_util, bc_util, sub_grade, total_acc))
remove pub_rec, revol_util, bc_util, sub_grade, total_acc
Loan_amount_den=ggplot(data=LC.data) + geom_density(mapping= aes(x = loan_amnt,fill=loan_status), alpha=0.7) + theme(legend.position = "top")
loan_amount_box = ggplot(data=LC.data, aes( x=loan_status, y=loan_amnt,)) + geom_boxplot()
figure.4 <- grid.arrange( Loan_amount_den,loan_amount_box, ncol = 2)
Charged_off customer usually have higher loan amount.
int_rate_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = int_rate,fill=loan_status), alpha =0.7)
int_rate_box = ggplot(data=LC.data, aes( x=loan_status, y=int_rate,)) + geom_boxplot()
figure.5 <- grid.arrange( int_rate_den,int_rate_box, ncol = 2)
# there are few outliers for customer who fully paid
LC.data = LC.data %>% filter(int_rate <=0.3 )
higher interest rate loan have higher chance turn into charged off loan.
There are few outliers, remove it.
annual_inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
annual_inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot()
figure.income <- grid.arrange( annual_inc_den,annual_inc_box, ncol = 2)
summary(LC.data$annual_inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4000 55000 80000 98866 120000 9100000
quantile(LC.data$annual_inc,c(0.95, 0.975))
## 95% 97.5%
## 213759.5 265000.0
# lots of outliers, remove top 2.5% customer
LC.data = LC.data %>% filter(annual_inc < 189000 )
# After remove outliers
inc_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = annual_inc,fill=loan_status), alpha =0.7)+ theme(legend.position = "top")
inc_box = ggplot(data=LC.data, aes( x=loan_status, y=annual_inc,)) + geom_boxplot() + ggtitle("After Remove Annual Income Outlier")
figure.income_after <- grid.arrange( inc_den,inc_box, ncol = 2)
There are few outliers, annual income is over millions, I remove top 5% quantile of annua income Higher income customer may have slightly higher chance paid off the loan
A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
dti_den = ggplot(data=LC.data) + geom_density(mapping= aes(x = dti,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
dti_box = ggplot(data=LC.data, aes( x=loan_status, y=dti,)) + geom_boxplot()
figure.7 <- grid.arrange( dti_den,dti_box, ncol = 2)
Charged Off customer usually have higher dti
Balance to credit limit on all trades
all_util_bar = ggplot(data=LC.data) + geom_density(mapping= aes(x = all_util,fill=loan_status), alpha=0.7)+ theme(legend.position = "top")
all_util_box = ggplot(data=LC.data, aes( x=loan_status, y=all_util,)) + geom_boxplot()
figure.8 <- grid.arrange( all_util_bar,all_util_box, ncol = 2)
summary(LC.data$all_util)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3900 0.5500 0.5365 0.6900 1.8500
quantile(LC.data$all_util, 0.975)
## 97.5%
## 0.93
#LC.data = LC.data %>% filter (all_util<=1.2)
fico_density =ggplot(data=LC.data) + geom_density(mapping= aes(x = fico_score,fill=loan_status), alpha = 0.5)+ theme(legend.position = "top")
fico_box =ggplot(data=LC.data, aes( x= loan_status, y=fico_score,)) + geom_boxplot()
figure.fico<- grid.arrange( fico_density,fico_box, ncol = 2)
####cr_year year of borrower’s earliest reported credit line was opened
cr_density=ggplot(data=LC.data) + geom_density(mapping= aes(x = cr_year,fill=loan_status), alpha = 0.5)+ theme(legend.position = "top")
cr_box =ggplot(data=LC.data, aes( x= loan_status, y=cr_year,)) + geom_boxplot()
figure.fico<- grid.arrange( cr_density,cr_box, ncol = 2)
summary(LC.data$cr_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 12.00 16.00 17.07 20.00 77.00
quantile(LC.data$cr_year, 0.975)
## 97.5%
## 36
LC.data = LC.data %>% filter(LC.data$cr_year<=50)
##percent_bc_gt_75
bc_gt_75_density =ggplot(data=LC.data) + geom_density(mapping= aes(x = percent_bc_gt_75,fill=loan_status), position="dodge", alpha=0.7)+ theme(legend.position = "top")
bc_gt_75_box =ggplot(data=LC.data, aes( x= loan_status, y=percent_bc_gt_75,)) + geom_boxplot()
figure.bc_gt_75 <- grid.arrange( bc_gt_75_density,bc_gt_75_box, ncol = 2)
## Warning: Width not defined. Set with `position_dodge(width = ?)`
open_acc.density = ggplot(data=LC.data) + geom_density(mapping= aes(x = open_acc,fill=loan_status), alpha=.5)+ theme(legend.position = "top")
open_acc.box = ggplot(data=LC.data, aes(x = loan_status, y= open_acc)) + geom_boxplot()
figure.open_acc=grid.arrange( open_acc.density,open_acc.box, ncol = 2)
summary(LC.data$open_acc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 8.00 11.00 11.78 15.00 67.00
quantile(LC.data$open_acc, .975)
## 97.5%
## 27
# remove top 5% quantile open_acc
LC.data = LC.data%>%filter(open_acc<=27)
num_actv.density = ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_actv_rev_tl,fill=loan_status), position="dodge")+ theme(legend.position = "top")
num_actv.box = ggplot(data=LC.data, aes(x = loan_status, y= num_actv_rev_tl)) + geom_boxplot()
figure.actv=grid.arrange( num_actv.density,num_actv.box, ncol = 2)
plot(LC$open_acc, LC$num_actv_rev_tl)
We can remove open_acc
###delinq_2yrs The number of 30+ days past-due incidences of delinquency in the borrower’s credit file for the past 2 years
ggplot(data=LC.data) + geom_bar(mapping= aes(x = delinq_2yrs, fill=loan_status), position="dodge")
# group more than 1 delinquency into one group
LC.data = LC.data %>% mutate(delinq_2yrs = case_when(delinq_2yrs >=1 ~ 1,
delinq_2yrs ==0 ~ 0))
ggplot(data=LC.data) + geom_bar(mapping= aes(x = delinq_2yrs, fill=loan_status), position="fill") + ggtitle("deling_2yrs After Grouping")
After grouping , there is no pattern that number of delinquency for the past 2 year would impact the prediction.
##inq_last_6mths The number of inquiries in past 6 months (excluding auto and mortgage inquiries)
#inq_last_6mths
ggplot(data=LC.data) + geom_bar(mapping= aes(x = inq_last_6mths,fill=loan_status), position = "dodge")+ theme(legend.position = "top")
LC.data= LC.data %>% mutate(inq_last_6mths = case_when(inq_last_6mths >=1 ~ 1,
inq_last_6mths ==0 ~ 0))
ggplot(data=LC.data) + geom_bar(mapping= aes(x = inq_last_6mths,fill=loan_status), position ="fill")+ theme(legend.position = "top")
Number of derogatory public records
ggplot(data=LC.data) + geom_histogram(mapping= aes(x = pub_rec_bankruptcies,fill=loan_status), position="dodge")+ theme(legend.position = "top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
LC.data= LC.data %>% mutate(pub_rec_bankruptcies = case_when(pub_rec_bankruptcies >=1 ~ 1,
pub_rec_bankruptcies ==0 ~ 0))
ggplot(data=LC.data) + geom_bar(mapping= aes(x = pub_rec_bankruptcies,fill=loan_status), position ="fill") + ggtitle("After Grouping (pub_rec_bankruptcies)")
pub_rec is not important.
Number of mortgage accounts.
mort_bar=ggplot(data=LC.data) + geom_bar(mapping= aes(x = mort_acc,fill=loan_status), postiion="dodge")+ theme(legend.position = "top")
## Warning: Ignoring unknown parameters: postiion
mort_box=ggplot(data=LC.data, aes( x= loan_status, y=mort_acc,)) + geom_boxplot()
mort_ownship_box =ggplot(data=LC.data, aes( x =home_ownership, y=mort_acc,fill=loan_status)) + geom_boxplot(position ="dodge")
figure.mort_acc <- grid.arrange( mort_bar,mort_box, mort_ownship_box, ncol = 2)
# remove mort_acc greater than 8
summary(LC.data$mort_acc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.415 2.000 23.000
quantile(LC.data$mort_acc, 0.95)
## 95%
## 5
#LC.data = LC.data %>% filter(LC.data$mort_acc<8)
##num_accts_ever_120_pd Number of accounts ever 120 or more days past due
ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_accts_ever_120_pd,fill=loan_status))+ theme(legend.position = "top")
LC.data= LC.data %>% mutate(num_accts_ever_120_pd = case_when(num_accts_ever_120_pd >=1 ~ 1,
num_accts_ever_120_pd <1 ~ 0))
ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_accts_ever_120_pd,fill=loan_status), position ="fill")+ theme(legend.position = "top")
Number of accounts opened in past 12 months
tl_open_12m_bar =ggplot(data=LC.data) + geom_bar(mapping= aes(x = num_tl_op_past_12m,fill=loan_status), position="dodge")+ theme(legend.position = "top")
tl_open_12m_box = ggplot(data=LC.data, aes( x= loan_status, y=num_tl_op_past_12m,)) + geom_boxplot()
figure.tl_open <- grid.arrange( tl_open_12m_bar,tl_open_12m_box, ncol = 2)
LC.data = LC.data %>% filter(num_tl_op_past_12m <= 10)
loanQuanCorr = cor(LC.data %>% select_if(is.numeric))
corrplot(loanQuanCorr, order ='hclust',type="upper", tl.cex = 0.8)
(highCorr = findCorrelation(loanQuanCorr, 0.7, verbose=T, names =T))
## All correlations <= 0.7
## character(0)