# Load the loan data set; the first row of the CSV holds the column names.
data.combined <- read.csv(file = "loan.csv", header = TRUE)
# Keep a copy of the freshly loaded frame before any column is added to it.
data.combined.BackUp <- data.combined
# Print a compact overview of the columns and their types.
str(object = data.combined)
###############################################################################################
#GENERAL OVERVIEW
# Derive the issue year of every loan from issue_d.
# issue_d is split on "-" and the second piece is kept as the year
# (presumably the values look like "Dec-2011" -- TODO confirm against the data).
year_loan <- as.character(data.combined$issue_d)
unique(year_loan)
library(stringr)  # NOTE(review): no stringr function is called in this section
year_loan <- strsplit(year_loan, "-", fixed = TRUE)
# vapply() always returns a character vector with one element per loan, even
# for empty input (sapply() would return a list there); values without a
# second piece become NA.
year_loan <- vapply(year_loan, function(parts) parts[2], character(1))
data.combined$year_loan <- as.factor(year_loan)
# A few overview charts.
# Bar chart: number of loans issued in each year.
library(ggplot2)
ggplot(data = data.combined, mapping = aes(x = year_loan)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Absolute number of loans by year")
# Observation: the absolute number of loans is increasing across the years.
# Total amount lent per year: one row per year_loan with the sum of loan_amnt.
library(plyr)
tot_loan_year <- ddply(
  .data = data.combined,
  .variables = .(year_loan),
  .fun = summarise,
  tot_loan = sum(loan_amnt)
)
tot_loan_year
ggplot(data = tot_loan_year, mapping = aes(x = year_loan, y = tot_loan)) +
  geom_col(fill = "steelblue") +
  labs(title = "Money given as a loan by year")
# Little difference from the previous chart, so look at how the average
# amount lent has changed across the years.
avg_loan_year <- ddply(
  .data = data.combined,
  .variables = .(year_loan),
  .fun = summarise,
  avg_loan = mean(loan_amnt)
)
avg_loan_year
ggplot(data = avg_loan_year, mapping = aes(x = year_loan, y = avg_loan)) +
  geom_col(fill = "steelblue") +
  labs(title = "Average moeny loned is changed across years")
# Observation: not visible in the first two charts, but the average amount
# lent is increasing.
# Average interest rate per year: one row per year_loan with mean(int_rate).
int_rate_year <- ddply(
  .data = data.combined,
  .variables = .(year_loan),
  .fun = summarise,
  avg_int_rate = mean(int_rate)
)
int_rate_year
ggplot(data = int_rate_year, mapping = aes(x = year_loan, y = avg_int_rate)) +
  geom_col(fill = "steelblue") +
  labs(title = "Avg interest rate across years")
# Observation: on average the interest rate decreases from 2013 onwards,
# which might partially explain why the number of loans is increasing.
# Compare three distributions: amount asked (loan_amnt), amount given
# (funded_amnt) and amount given by investors (funded_amnt_inv).
x <- data.frame(
  asked = data.combined$loan_amnt,
  given = data.combined$funded_amnt,
  investor = data.combined$funded_amnt_inv
)
library(reshape2)
# Long format: one row per (variable, value) pair. Naming the measure columns
# explicitly stacks the same three columns as a bare melt(x) would, without
# the "No id variables; using all as measure variables" message.
data <- melt(x, measure.vars = names(x))
ggplot(data, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  ggtitle("Distribution")
# Semi-transparent fills: with fully opaque fills (alpha = 1) the three
# overlapping curves hide each other.
ggplot(data, aes(x = value, fill = variable)) +
  geom_density(alpha = 0.4) +
  ggtitle("Amount function by year")
# The three are basically the same, so take the amount asked and look at how
# its distribution changes across the years.
ggplot(data.combined, aes(x = loan_amnt)) +
  geom_density() +
  facet_wrap("year_loan")
# Observation: over the years fewer small loans and more large loans were
# granted (more diversified than initially).
# To test: is that a good choice for the probability of being paid back?
ggplot(data.combined, aes(x = loan_amnt)) +
  geom_histogram(fill = "steelblue", binwidth = 1000)
# Observation: most of the loans are concentrated around 10k-20k.
ggplot(data.combined, aes(x = loan_amnt)) +
  geom_histogram(fill = "steelblue", binwidth = 1000) +
  facet_wrap("year_loan")
# Observation: in the last two years larger loans, even above 20k (less common
# before), were granted -> 2014 and 2015 drive the overall trend.
###############################################################################################
# GOOD LOANS VS BAD LOANS
# Inspect the raw loan_status values and their frequencies.
loans <- as.factor(data.combined$loan_status)
unique(loans)
table(loans)
ggplot(data = data.combined, mapping = aes(x = loan_status)) +
  geom_bar(width = 1, fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Group the statuses into three categories: gl (good loan), bl (bad loan) and
# tbd (to be defined).
# The replacement is positional: entry i renames the i-th level of loans, so
# it assumes exactly ten levels in their current order -- TODO confirm with
# table(loans) whenever the data changes.
levels(loans) <- c(
  "bl", "tbd", "bl", "bl", "gl",
  "gl", "bl", "tbd", "bl", "bl"
)
table(loans)
# Set the tbd rows aside for now in order to compare the characteristics of
# bad and good loans.
data.combined$status <- loans
data.combined2 <- data.combined[which(data.combined$status != "tbd"), ]
table(data.combined2$status)
# The levels are now bl, tbd, gl and no tbd row is left; renaming the middle
# level to bl leaves only the two levels bl and gl.
levels(data.combined2$status) <- c("bl", "bl", "gl")
# Total amount lent per loan-size band: below 10k, 10k-20k, 20k-30k and
# 30k or more (lower bound inclusive, upper bound exclusive).
# Idea still to test: share of bad loans within each band.
loans10 <- data.combined2[which(data.combined2$loan_amnt < 10000), "loan_amnt"]
loans1020 <- data.combined2[
  which(data.combined2$loan_amnt >= 10000 & data.combined2$loan_amnt < 20000),
  "loan_amnt"
]
loans2030 <- data.combined2[
  which(data.combined2$loan_amnt >= 20000 & data.combined2$loan_amnt < 30000),
  "loan_amnt"
]
loans30 <- data.combined2[which(data.combined2$loan_amnt >= 30000), "loan_amnt"]
somme <- c(sum(loans10), sum(loans1020), sum(loans2030), sum(loans30))
nomi_somme <- c("<10", "10-20", "20-30", "30+")
df.somme <- data.frame(somme, nomi_somme)
# Pin the bands to ascending order on the x axis; left to the default, the
# labels are sorted as strings and where "<10" lands depends on the locale.
ggplot(df.somme, aes(x = nomi_somme, y = somme)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  scale_x_discrete(limits = nomi_somme)
# Total amount lent per status (bl / gl).
library(plyr)
tot_loan_status <- ddply(
  .data = data.combined2,
  .variables = .(status),
  .fun = summarise,
  tot_loan = sum(loan_amnt)
)
tot_loan_status
ggplot(tot_loan_status, aes(x = status, y = tot_loan, fill = status)) +
  geom_col()
# Density of the loan amount, one filled curve per status.
ggplot(data.combined2, aes(x = loan_amnt, fill = status)) +
  geom_density()
# Observation: the frequency of bad loans increases with loan_amnt (larger
# loans are harder to pay back).
# -> Is it worth increasing the average loan amount, as seen in the earlier
#    per-year chart?
# Same density, drawn separately for good loans, for bad loans, and finally
# side by side in one facet per status.
ggplot(data.combined2[data.combined2$status %in% "gl", ], aes(x = loan_amnt)) +
  geom_density()
ggplot(data.combined2[data.combined2$status %in% "bl", ], aes(x = loan_amnt)) +
  geom_density()
ggplot(data.combined2, aes(x = loan_amnt, fill = status)) +
  geom_density() +
  facet_wrap(~ status)
# Totals for good and bad loans: number of loans per status, then the summed
# loan amount of each group (missing amounts are ignored).
tot <- count(data.combined2$status)
tot_gl <- sum(
  data.combined2$loan_amnt[data.combined2$status %in% "gl"],
  na.rm = TRUE
)
tot_bl <- sum(
  data.combined2$loan_amnt[data.combined2$status %in% "bl"],
  na.rm = TRUE
)
# Histogram of the loan amount in 1000-wide bins, stacked by status.
ggplot(data.combined2, aes(x = loan_amnt, fill = status)) +
  geom_histogram(binwidth = 1000)
# Amount lent per year, stacked by status.
ggplot(data.combined2, aes(x = year_loan, y = loan_amnt, fill = status)) +
  geom_col()
# Observation: the proportion of bad loans is increasing across the years
# -> we must understand why.
# Interest rate of bad vs good loans.
ggplot(data.combined2, aes(x = int_rate)) +
  geom_density()
ggplot(data.combined2, aes(x = int_rate, fill = status)) +
  geom_density()
# There are too many distinct rates to plot as bars, so group them into
# 5-point bins [0,5), [5,10), ..., [25,30); rates outside 0-30 become NA.
data.combined2$New_int_rate <- cut(
  data.combined2$int_rate,
  breaks = seq(from = 0, to = 30, by = 5),
  right = FALSE
)
ggplot(data.combined2, aes(x = New_int_rate, fill = status)) +
  geom_bar()
# Average interest rate of the good and of the bad loans.
avg_int_rate_gl <- mean(data.combined2$int_rate[data.combined2$status %in% "gl"])
avg_int_rate_bl <- mean(data.combined2$int_rate[data.combined2$status %in% "bl"])
# Variables describing the economic stability of the borrower
# (income, house, job).
# Annual income, restricted to borrowers with annual_inc of at most 90000.
ggplot(
  subset(data.combined2, annual_inc <= 90000),
  aes(x = annual_inc, fill = status)
) +
  geom_histogram(binwidth = 1000)
ggplot(
  subset(data.combined2, annual_inc <= 90000),
  aes(x = annual_inc, fill = status)
) +
  geom_density()
# Home ownership.
table(data.combined2$home_ownership)
# Look only at mortgage, own and rent: rows whose home_ownership is NONE,
# ANY or OTHER are left out.
ggplot(
  subset(
    data.combined2,
    home_ownership != "NONE" & home_ownership != "ANY" &
      home_ownership != "OTHER"
  ),
  aes(x = home_ownership, fill = status)
) +
  geom_bar()
# Observation: own has significantly fewer bad loans (as expected), while
# there is no difference between rent and mortgage.
ggplot(
  subset(
    data.combined2,
    home_ownership != "NONE" & home_ownership != "ANY" &
      home_ownership != "OTHER"
  ),
  aes(x = home_ownership, y = loan_amnt, fill = status)
) +
  geom_col()
# Employment length.
table(data.combined2$emp_length)
# Each "+" has to end a line, not start the next one. Otherwise R treats
# ggplot(...) as a complete statement (an empty plot) and then fails on the
# following line, which begins with a unary "+".
ggplot(data.combined2, aes(x = emp_length, fill = status)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Observation: it does not seem to have a strong impact.
# Loan title: there are a lot of distinct values. List the most frequent
# titles and the share of all loans they cover (Pareto-style check).
length(unique(data.combined2$title))
head(data.combined2$title)
# The 15 most frequent raw titles (fewer when the data holds fewer distinct
# titles, instead of padding the list with NA entries).
title.counts <- sort(table(data.combined2$title), decreasing = TRUE)
title.counts <- title.counts[seq_len(min(15L, length(title.counts)))]
title.frequent <- data.frame(
  Var1 = names(title.counts),
  Freq = as.vector(title.counts),
  stringsAsFactors = FALSE
)
sum.title.frequent <- sum(title.frequent$Freq)
pareto_limit <- sum.title.frequent / nrow(data.combined2)
pareto_limit
title.frequent.name <- title.frequent$Var1
title.frequent.name
# The list shows several spellings of the same purpose (e.g. Debt
# Consolidation). Merge the most frequent variants under one label each.
# NOTE(review): "Debit Consolidation" looks like a typo for "Debt
# Consolidation"; the label is kept because it ends up in the data values.
title.aliases <- list(
  "Debit Consolidation" = c(
    "Debt consolidation", "Debt Consolidation", "debt consolidation",
    "Debt Consolidation Loan", "debt consolidation loan"
  ),
  "Consolidation" = c("Consolidation Loan", "consolidation", "Consolidation"),
  "Home" = c("Home improvement", "Home Improvement"),
  "Credit Card Refinance" = c("Credit card refinancing", "Credit Card Refinance")
)
# Replace every known spelling in x by its merged label.
# Works on a character copy: on a factor, assigning a label that is not
# already a level would turn those entries into NA (with a warning).
merge_title_aliases <- function(x) {
  x <- as.character(x)
  for (label in names(title.aliases)) {
    x[x %in% title.aliases[[label]]] <- label
  }
  x
}
titles <- merge_title_aliases(data.combined2$title)
data.combined2$title <- titles
# title.frequent.name was built from the raw spellings, so the merged labels
# would not match it and the merged categories would drop out of the plots.
# Translate the frequent-title list with the same aliases before filtering.
title.frequent.merged <- unique(merge_title_aliases(title.frequent.name))
ggplot(
  data.combined2[which(data.combined2$title %in% title.frequent.merged), ],
  aes(x = title, fill = status)
) +
  geom_bar(width = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Same chart without "Debit Consolidation" and "Credit Card Refinance".
title.frequent.minor <- setdiff(
  title.frequent.merged,
  c("Debit Consolidation", "Credit Card Refinance")
)
ggplot(
  data.combined2[which(data.combined2$title %in% title.frequent.minor), ],
  aes(x = title, fill = status)
) +
  geom_bar(width = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Next step: predict whether the current loans will turn bad (create a target variable similar to Survived and test it on the current loans) + cross-validation
#Review NA handling + all the main functions (string manipulation and plots)
#Additional tasks:
#1) read the considerations in each kernel about this dataset
#2) see what the common file formats are (in addition to .csv) and how to read them
#3) define a structured approach to the analysis
###############################################################################################
# NOTE: the statements that used to follow here were a verbatim second copy of
# the whole analysis above (backup, yearly overview, good vs bad loans, income,
# home ownership, employment length and titles).
# Running that copy overwrote data.combined.BackUp with the already modified
# data frame (the one carrying the added year_loan and status columns), and
# recomputed every summary and redrew every plot a second time. It has been
# removed; every object defined by the analysis above is still defined.
# Blog page footer, not R code: "No comments:"
# "Post a comment"