基于R语言的信用评分卡模型

基于R语言的信用评分卡模型

基于R语言的信用评分卡模型

一 、项目背景

1.1概述

信用评分是指根据客户的信用历史资料,利用一定的信用评分模型,得到不同等级的信用分数。根据客户的信用分数, 授信者可以分析客户按时还款的可能性。据此, 授信者可以决定是否准予授信以及授信的额度和利率。目前信用评分主要使用在一些互联网金融企业和保险银行机构,主要用来解决目前金融机构存在的信用风控问题,常用的模型是逻辑回归。

1.2 数据来源

本项目数据来源于kaggle竞赛Give Me Some Credit,有15万条样本数据。

地址1:kaggle.com/c/GiveMeSome

地址2:pan.baidu.com/s/135XUFm

1.3 目标

构建信用评分卡

二 、数据预处理

2.1数据字典概览


2.2.导入数据

rm(list = ls())
a<-read.csv("D:\\cs-training.csv")
#给列重命名
colnames(a)<-c("id","y","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10")

2.3缺失值处理

#查看缺失
library(colorspace)
library(grid)
library(data.table)
library(VIM)
library(lattice)
library(mice)
md.pattern(a)

缺失主要集中在x5月收入,缺失占比19.8%,和x10家属数量,缺失占比2.6%

#对x5缺失处理
x5<-a$x5
var_x5<-c(
  var="x5",
  mean=mean(x5,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x5,na.rm=TRUE) ,
  quantile(x5,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE),
  max=max(x5,na.rm=TRUE),
  missing=sum(is.na(x5))
)
View(t(var_x5))
#用mean填补缺失值
a$x5<-ifelse(is.na(a$x5)==T,6670.2,a$x5)
#对x10缺失处理
x10<-a$x10
var_x10<-c(
  var="x10",
  mean=mean(x10,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x10,na.rm=TRUE) ,
  quantile(x10,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE),
  max=max(x10,na.rm=TRUE),
  missing=sum(is.na(x10))
)
View(t(var_x10))
#用mean填补缺失值
a$x10<-ifelse(is.na(a$x10)==T,0.75,a$x10)

2.4异常值处理

对x1(无担保放款的循环利用)处理

#对x1(无担保放款的循环利用)处理
a$x1<-round(a$x1,2)
x1<-a$x1
var_x1<-c(
  var="x1",
  mean=mean(x1,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x1,na.rm=TRUE) ,
  quantile(x1,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.91,0.99,1),na.rm=TRUE),
  max=max(x1,na.rm=TRUE),
  missing=sum(is.na(x1))
)
View(t(var_x1))
sum(a$x1 >1)#3057
boxplot(x1~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x1进行盖帽法处理异常值
block1<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q90<-quantile(x,0.90)
    x[x>q90]<-q90
  }
  return(x)
}
a$x1<-block1(a$x1)  
处理异常值后的箱线图

对x2(age)处理

#对x2(age)处理
unique(a$x2)
sum(a$x2==0)
a<-a[-which(a$x2==0),]#删除年龄为0的异常记录
x2<-a$x2
var_x2<-c(
  var="x2",
  mean=mean(x2,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x2,na.rm=TRUE) ,
  quantile(x2,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE),
  max=max(x2,na.rm=TRUE),
  missing=sum(is.na(x2))
)
View(t(var_x2))
boxplot(x2~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x2进行盖帽法处理异常值
block2<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q90<-quantile(x,0.90)
    x[x>q90]<-q90
  }
  return(x)
}

a$x2<-block2(a$x2)
处理异常值后的箱线图

对x3(35-59天逾期次数)变量处理

#对x3(35-59天逾期次数)变量处理
unique(a$x3)
x3<-a$x3
var_x3<-c(
  var="x3",
  mean=mean(x3,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x3,na.rm=TRUE) ,
  quantile(x3,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE),
  max=max(x3,na.rm=TRUE),
  missing=sum(is.na(x3))
)
View(t(var_x3))
boxplot(x3~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#
#盖帽法
block<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q99<-quantile(x,0.99)
    x[x>q99]<-q99
  }
  return(x)
}
a$x3<-block(a$x3)

对x4(负债比率)变量处理

#对x4(负债比率)变量处理
a$x4<-round(a$x4,2)
sum(a$x4>1)#35078
x4<-a$x4
var_x4<-c(
  var="x4",
  mean=mean(x4,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x4,na.rm=TRUE) ,
  quantile(x4,c(0,0.01,0.1,0.25,0.5,0.75,0.76,0.77,0.8,0.85,0.9,0.99,1),na.rm=TRUE),
  max=max(x4,na.rm=TRUE),
  missing=sum(is.na(x4))
)
View(t(var_x4))
boxplot(x4~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x4进行盖帽法处理异常值
block4<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q76<-quantile(x,0.76)
    x[x>q76]<-q76
  }
  return(x)
}
a$x4<-block4(a$x4)

对x5(月收入)处理

#对x5(月收入)处理
x5<-a$x5
var_x5<-c(
  var="x5",
  mean=mean(x5,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x5,na.rm=TRUE) ,
  quantile(x5,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.99,1),na.rm=TRUE),
  max=max(x5,na.rm=TRUE),
  missing=sum(is.na(x5))
)
View(t(var_x5))
boxplot(x5~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x5进行盖帽法处理异常值
a$x5<-block(a$x5)

对x6(开放式信贷和贷款数量)处理

#对x6(开放式信贷和贷款数量)处理
x6<-a$x6
var_x6<-c(
  var="x6",
  mean=mean(x6,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x6,na.rm=TRUE) ,
  quantile(x6,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.99,1),na.rm=TRUE),
  max=max(x6,na.rm=TRUE),
  missing=sum(is.na(x6))
)
View(t(var_x6))
boxplot(x6~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x6进行盖帽法处理异常值
a$x6<-block(a$x6)

对x7(90天逾期次数)处理

#对x7(90天逾期次数)处理
unique(a$x7)
x7<-a$x7
var_x7<-c(
  var="x7",
  mean=mean(x7,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x7,na.rm=TRUE) ,
  quantile(x7,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.99,0.995,0.997,0.998,0.999,1),na.rm=TRUE),
  max=max(x7,na.rm=TRUE),
  missing=sum(is.na(x7))
)
View(t(var_x7))
boxplot(x7~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x7进行盖帽法处理异常值
block7<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q998<-quantile(x,0.998)
    x[x>q998]<-q998
  }
  return(x)
}
a$x7<-block7(a$x7)

对x8(不动产贷款或额度数量)处理

#对x8(不动产贷款或额度数量)处理
unique(a$x8)
x8<-a$x8
var_x8<-c(
  var="x8",
  mean=mean(x8,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x8,na.rm=TRUE) ,
  quantile(x8,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.99,1),na.rm=TRUE),
  max=max(x8,na.rm=TRUE),
  missing=sum(is.na(x8))
)
View(t(var_x8))
boxplot(x8~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x8进行盖帽法处理异常值
a$x8<-block(a$x8)

对x9(60-89天逾期但不糟糕次数)处理

#对x9(60-89天逾期但不糟糕次数)处理
unique(a$x9)
x9<-a$x9
var_x9<-c(
  var="x9",
  mean=mean(x9,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x9,na.rm=TRUE) ,
  quantile(x9,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.95,0.97,0.99,0.998,0.9986,0.999,1),na.rm=TRUE),
  max=max(x9,na.rm=TRUE),
  missing=sum(is.na(x9))
)
View(t(var_x9))
boxplot(x9~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x9进行盖帽法处理异常值
block9<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q998<-quantile(x,0.998)
    x[x>q998]<-q998
  }
  return(x)
}
a$x9<-block9(a$x9)

#对x10(家属数量)处理

#对x10(家属数量)处理
unique(a$x10)
a$x10<-round(a$x10,0)
x10<-a$x10
var_x10<-c(
  var="x10",
  mean=mean(x10,na.rm=TRUE) ,   #na.rm=TRUE去除NA的影响
  median=median(x10,na.rm=TRUE) ,
  quantile(x10,c(0,0.01,0.1,0.25,0.5,0.75,0.8,0.85,0.9,0.99,1),na.rm=TRUE),
  max=max(x10,na.rm=TRUE),
  missing=sum(is.na(x10))
)
View(t(var_x10))
boxplot(x10~y,data=a,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
#对x10进行异常值chuli
a$x10<-ifelse(a$x10>10,10,a$x10)

2.5变量相关性处理

library(ape)
library(permute)
library(vegan)
library(nlme)
library(picante) 
var_cor<-c("x1","x2","x3","x4","x5","x6","x7","x8","x9","x10")
a_cor<-a[,var_cor]  
cor<-data.frame(cor.table(a_cor,cor.method = 'pearson')) 
View(cor)
无高度相关的变量,无需处理

2.6变量分箱

#最优分箱
library(sqldf)
library(gsubfn)
library(proto)
library(RSQLite)
library(partykit)
library(grid)
library(libcoin)
library(mvtnorm)
library(Formula)
library(smbinning)

#对x1(无担保放款的循环利用)分箱

#对x1(无担保放款的循环利用)分箱
x1_s<-smbinning(a,'y','x1')
x1_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x1_s,option="dist",sub="x1")
smbinning.plot(x1_s,option="WoE",sub="x1")
smbinning.plot(x1_s,option="goodrate",sub="x1")
smbinning.plot(x1_s,option="badrate",sub="x1")
par(mfrow=c(1,1))
x1_s$iv
a_iv<-c(x1=x1_s$iv)

#对x2(年龄)分箱

#对x2(年龄)分箱
x2_s<-smbinning(a,'y','x2')
x2_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x2_s,option="dist",sub="x2")
smbinning.plot(x2_s,option="WoE",sub="x2")
smbinning.plot(x2_s,option="goodrate",sub="x2")
smbinning.plot(x2_s,option="badrate",sub="x2")
par(mfrow=c(1,1))
x2_s$iv
a_iv<-c(a_iv,x2=x2_s$iv)


对x3(35-59天逾期次数)分箱

#对x3(35-59天逾期次数)分箱
unique(a$x3)
x3_s<-smbinning(a,'y','x3')
x3_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x3_s,option="dist",sub="x3")
smbinning.plot(x3_s,option="WoE",sub="x3")
smbinning.plot(x3_s,option="goodrate",sub="x3")
smbinning.plot(x3_s,option="badrate",sub="x3")
par(mfrow=c(1,1))
x3_s$iv
a_iv<-c(a_iv,x3=x3_s$iv)

对x4(负债比率)分箱

#对x4(负债比率)分箱
x4_s<-smbinning(a,'y','x4')
x4_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x4_s,option="dist",sub="x4")
smbinning.plot(x4_s,option="WoE",sub="x4")
smbinning.plot(x4_s,option="goodrate",sub="x4")
smbinning.plot(x4_s,option="badrate",sub="x4")
par(mfrow=c(1,1))
x4_s$iv
a_iv<-c(a_iv,x4=x4_s$iv)

对x5(月收入)分箱

#对x5(月收入)分箱
x5_s<-smbinning(a,'y','x5')
x5_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x5_s,option="dist",sub="x5")
smbinning.plot(x5_s,option="WoE",sub="x5")
smbinning.plot(x5_s,option="goodrate",sub="x5")
smbinning.plot(x5_s,option="badrate",sub="x5")
par(mfrow=c(1,1))
x5_s$iv
a_iv<-c(a_iv,x5=x5_s$iv)

#对x6(开放式信贷和贷款数量)分箱

#对x6(开放式信贷和贷款数量)分箱
x6_s<-smbinning(a,'y','x6')
x6_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x6_s,option="dist",sub="x6")
smbinning.plot(x6_s,option="WoE",sub="x6")
smbinning.plot(x6_s,option="goodrate",sub="x6")
smbinning.plot(x6_s,option="badrate",sub="x6")
par(mfrow=c(1,1))
x6_s$iv
a_iv<-c(a_iv,x6=x6_s$iv)

对x7(90天逾期次数)分箱

#对x7(90天逾期次数)分箱
x7_s<-smbinning(a,'y','x7')
x7_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x7_s,option="dist",sub="x7")
smbinning.plot(x7_s,option="WoE",sub="x7")
smbinning.plot(x7_s,option="goodrate",sub="x7")
smbinning.plot(x7_s,option="badrate",sub="x7")
par(mfrow=c(1,1))
x7_s$iv
a_iv<-c(a_iv,x7=x7_s$iv)

#对x8(不动产贷款或额度数量)分箱

#对x8(不动产贷款或额度数量)分箱
x8_s<-smbinning(a,'y','x8')
x8_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x8_s,option="dist",sub="x8")
smbinning.plot(x8_s,option="WoE",sub="x8")
smbinning.plot(x8_s,option="goodrate",sub="x8")
smbinning.plot(x8_s,option="badrate",sub="x8")
par(mfrow=c(1,1))
x8_s$iv
a_iv<-c(a_iv,x8=x8_s$iv)

对x9(60-89天逾期但不糟糕次数)分箱

#对x9(60-89天逾期但不糟糕次数)分箱
x9_s<-smbinning(a,'y','x9')
x9_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x9_s,option="dist",sub="x9")
smbinning.plot(x9_s,option="WoE",sub="x9")
smbinning.plot(x9_s,option="goodrate",sub="x9")
smbinning.plot(x9_s,option="badrate",sub="x9")
par(mfrow=c(1,1))
x9_s$iv
a_iv<-c(a_iv,x9=x9_s$iv)

x10(家属数量)分箱

#x10(家属数量)分箱
x10_s<-smbinning(a,'y','x10')
x10_s$ivtable
par(mfrow=c(2,2))
smbinning.plot(x10_s,option="dist",sub="x10")
smbinning.plot(x10_s,option="WoE",sub="x10")
smbinning.plot(x10_s,option="goodrate",sub="x10")
smbinning.plot(x10_s,option="badrate",sub="x10")
par(mfrow=c(1,1))
x10_s$iv
a_iv<-c(a_iv,x10=x10_s$iv)

对y处理

#对y处理
a$y<-as.numeric(!as.logical(a$y))

iv值

barplot(a_iv,main="各变量信息值",)

#分箱后的新列

#分箱后的新列
a_s<-a
View(a_s)
a_s<-smbinning.gen(a_s,x1_s,'x1_s')
a_s<-smbinning.gen(a_s,x2_s,'x2_s')
a_s<-smbinning.gen(a_s,x3_s,'x3_s')
a_s<-smbinning.gen(a_s,x4_s,'x4_s')
a_s<-smbinning.gen(a_s,x5_s,'x5_s')
a_s<-smbinning.gen(a_s,x6_s,'x6_s')
a_s<-smbinning.gen(a_s,x7_s,'x7_s')
a_s<-smbinning.gen(a_s,x8_s,'x8_s')
a_s<-smbinning.gen(a_s,x9_s,'x9_s')
a_s<-smbinning.gen(a_s,x10_s,'x10_s')
a_ss<-a_s[,c(1:2,13:22)]
View(a_ss)

三、建模

3.1 建立逻辑回归模型

a_ss_glm<-glm(y~.,data = a_ss[,-1],family = binomial())
summary(a_ss_glm)

3.2 vif检验

#vif检验
library(carData)
library(car)
vif(a_ss_glm)
均小于2,无需处理

四、构建评分卡

4.1 对模型的变量进行打分

#对模型的变量进行打分
#score=A-B*log(odds)
a_ss_score<-smbinning.scaling(a_ss_glm,pdo=45,score=800,odds=50)
a_ss_score$minmaxscore#360 890
a_ss_score$logitscaled

4.2 生成每行对应的分数

#生成每行对应的分数
a_score<-smbinning.scoring.gen(smbscaled=a_ss_score, dataset=a_ss)
boxplot(Score~y,data=a_score,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")

4.3 画ROC曲线

smbinning.metrics(a_score,"Score","y",plot="auc")#85.94

4.4 画ks曲线

smbinning.metrics(a_score,"Score","y",plot="ks")#56.06

4.5 生成评分卡并导出

scaledcard<-a_ss_score$logitscaled[[1]][-1,c(1,2,6)]
scaledcard
scaledcard[,1]<-c(rep("x1",8),rep("x2",9),
                  rep("x3",3),rep("x4",5),rep("x5",5),
                  rep("x6",5),rep('x7',2),rep('x8',4),rep('x9',2),rep('x10',4))
write.csv(scaledcard,"D:\\score.csv",row.names = F)

4.6 个人评分计算案例

分数越高,违约可能性越小

编辑于 2018-09-22 23:02