Fork me on GitHub

travis-status Coverage Status

Introduction

The riskr package facilitate credit scoring tasks such as measure the scores/models performance and make easy the scoring modelling process.

There are function to:

  1. Measure in a simple way the performance of models via wrappers/shortcuts from ROCR functions.
  2. Visualize relationships between variables.
  3. Compute usual values in the credit scoring PSI, WOE, IV, KS, AUCROC, among others.
  4. Make easier the modelling and validation process.

Assumptions

riskr assume the target variable is binary with numeric values: 0 and 1. Usually 1 means the characteristic of interest.

Installation

You can install the latest development version from github with:

devtools::install_github("jbkunst/riskr")

Functions

Usually we have a data frame with a target variable and a score (or probability) like this:

library("riskr")

data("predictions")

head(predictions)
score target
0.202 1
0.806 1
0.513 1
0.052 0
0.329 1
0.246 0

str(predictions)
## 'data.frame':    10000 obs. of  2 variables:
##  $ score : num  0.2023 0.8058 0.5134 0.0525 0.3288 ...
##  $ target: num  1 1 1 0 1 0 1 0 1 1 ...

score <- predictions$score

target <- predictions$target

Performance Indicators

The main statistics or indicators are KS, AUCROC so:

ks(score, target)
## [1] 0.254

aucroc(score, target)
## [1] 0.676

gini(score, target)
## [1] 0.353

perf(score, target)
count target_count target_rate ks aucroc gini divergence
10000 6990 0.699 0.254 0.676 0.353 0.408

There are some functions to plot the score/model performance (based on ggplot package).

plot_roc(score, target)


plot_gain(score, target)


plot_lift(score, target)


plot_dists(score, target) +
  ggtitle(sprintf("The KS statistics is %.0f%%", 100 * ks(score, target)))


plot_ks(score, target) + ggtitle("Comparing ECDF")

plot_perf(score, target)

Odds Tables

The odds tables are other way to show how a score/model performs.

score <- round(predictions$score * 1000)

odds_table(score, target, nclass = 5) # default is (nclass =) 10 groups of equal size
class count percent target_count target_rate target_percent non_target_count non_target_percent odds woe iv
[1,164] 2009 0.201 1010 0.503 0.144 999 0.332 1.01 -0.832 0.156
(164,331] 1991 0.199 1255 0.630 0.180 736 0.245 1.71 -0.309 0.020
(331,526] 2008 0.201 1429 0.712 0.204 579 0.192 2.47 0.061 0.001
(526,738] 2000 0.200 1573 0.786 0.225 427 0.142 3.68 0.461 0.038
(738,996] 1992 0.199 1723 0.865 0.246 269 0.089 6.41 1.015 0.159

odds_table(score, target, breaks = c(0, 300, 700, 999))
class count percent target_count target_rate target_percent non_target_count non_target_percent odds woe iv
(0,300] 3675 0.368 2052 0.558 0.294 1623 0.539 1.26 -0.608 0.149
(300,700] 3978 0.398 2926 0.736 0.419 1052 0.350 2.78 0.180 0.012
(700,999] 2347 0.235 2012 0.857 0.288 335 0.111 6.01 0.950 0.168

Confusion Matrix

The conf_matrix function return a list with the next elements:

score_cat <- ifelse(score < 500, 0, 1)

cm <- conf_matrix(score_cat, target)
  • The confusion matrix:
cm$confusion.matrix
class pred 0 pred 1
0 true 0 2230 780
1 true 1 3476 3514
  • The confusion matrix statistics
cm$indicators
term term.short value
Accuracy AC 0.574
Recall | True Positive rate (GG) Recall 0.503
False Positive rate FP 0.259
True Negative rate (BB) TN 0.741
False Negative rate FN 0.497
Precision P 0.818

cm$indicators.t
AC Recall FP TN FN P
0.574 0.503 0.259 0.741 0.497 0.818

Bivariate Tables

data("credit")

str(credit)
## 'data.frame':    49694 obs. of  17 variables:
##  $ id_client          : int  1 7 9 12 14 19 22 26 28 30 ...
##  $ sex                : chr  "F" "F" "F" "F" ...
##  $ marital_status     : chr  "O" "S" "S" "C" ...
##  $ age                : int  44 22 27 32 36 46 17 20 71 46 ...
##  $ flag_res_phone     : chr  "N" "Y" "Y" "Y" ...
##  $ area_code_res_phone: int  31 31 31 31 31 50 50 50 31 31 ...
##  $ payment_day        : int  12 12 20 12 12 12 12 12 18 8 ...
##  $ residence_type     : chr  "P" "A" "A" "P" ...
##  $ months_in_residence: int  12 0 0 24 120 360 12 12 96 72 ...
##  $ months_in_the_job  : int  48 48 0 0 36 120 12 24 12 12 ...
##  $ profession_code    : int  731 999 950 165 15 704 38 39 13 801 ...
##  $ flag_other_card    : chr  "N" "N" "N" "N" ...
##  $ flag_mobile_phone  : chr  "N" "N" "N" "N" ...
##  $ flag_contact_phone : chr  "N" "N" "N" "N" ...
##  $ personal_net_income: num  300 410 1000 700 1987 ...
##  $ quant_add_cards    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ bad                : int  0 0 1 0 0 0 1 1 0 0 ...

ft(credit$marital_status)
class count percent
S 25249 0.508
C 17097 0.344
O 2776 0.056
V 2430 0.049
D 2142 0.043

bt(credit$marital_status, credit$bad)
class count percent target_count target_rate target_percent non_target_count non_target_percent odds woe iv
C 17097 0.344 2483 0.145 0.253 14614 0.366 0.170 -0.370 0.042
D 2142 0.043 322 0.150 0.033 1820 0.046 0.177 -0.330 0.004
O 2776 0.056 660 0.238 0.067 2116 0.053 0.312 0.237 0.003
S 25249 0.508 6059 0.240 0.617 19190 0.481 0.316 0.249 0.034
V 2430 0.049 289 0.119 0.029 2141 0.054 0.135 -0.600 0.015

library("ggplot2")

credit$age_bin <- cut_interval(credit$age, 3)

bt(credit$age_bin, credit$bad)
class count percent target_count target_rate target_percent non_target_count non_target_percent odds woe iv
[15,41.7] 35654 0.717 8174 0.229 0.833 27480 0.689 0.297 0.190 0.027
(41.7,68.3] 13313 0.268 1585 0.119 0.162 11728 0.294 0.135 -0.599 0.079
(68.3,95] 727 0.015 54 0.074 0.006 673 0.017 0.080 -1.121 0.013
plot_ba(credit$age_bin, credit$bad)


# order by odds
lvls_rt <- bt(credit$residence_type, credit$bad) %>% 
  arrange(desc(odds)) %>%  .[["class"]]

residence_type2 <- factor(credit$residence_type, levels = lvls_rt)

plot_ba(residence_type2, credit$bad)

The minified version of plot_ba

plot_ba2(credit$age_bin, credit$bad) + ggtitle("Age")



plot_ba2(credit$flag_res_phone, credit$bad, labels = FALSE) +
  ggtitle("Flag Response Phone")

Session Info

library("riskr")
library("printr")
library("ggplot2")
library("ggthemes")
options(digits = 3, knitr.table.format = "markdown")
knitr::opts_chunk$set(collapse = TRUE, warning = FALSE,
                      fig.path = "vignettes/figures/",
                      fig.width = 6, fig.height = 6,
                      fig.align = "center", dpi = 72)

theme_set(theme_fivethirtyeight(base_size = 11) +
            theme(axis.title = element_text(colour = "grey30"),
                  axis.title.y = element_text(angle = 90),
                  strip.background = element_rect(fill = "#434348"),
                  strip.text = element_text(color = "#F0F0F0"),
                  plot.title = element_text(face = "plain", size = structure(1.2, class = "rel"))))
update_geom_defaults("line", list(colour = "#434348", size = 1.05))
update_geom_defaults("point", list(colour = "#434348", size = 3))
update_geom_defaults("bar", list(fill = "#7cb5ec"))
update_geom_defaults("text", list(size = 4, colour = "gray30"))
print(sessionInfo())
## R version 3.2.0 (2015-04-16)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=Spanish_Chile.1252  LC_CTYPE=Spanish_Chile.1252   
## [3] LC_MONETARY=Spanish_Chile.1252 LC_NUMERIC=C                  
## [5] LC_TIME=Spanish_Chile.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_0.2.0    plyr_1.8.3     dplyr_0.4.1    scales_0.2.5  
##  [5] ROCR_1.0-7     gplots_2.17.0  ggthemes_2.2.1 ggplot2_1.0.1 
##  [9] printr_0.0.4   riskr_1.0     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.11.6        knitr_1.10.5       magrittr_1.5      
##  [4] MASS_7.3-40        munsell_0.4.2      colorspace_1.2-6  
##  [7] stringr_1.0.0      highr_0.5          caTools_1.17.1    
## [10] tools_3.2.0        parallel_3.2.0     grid_3.2.0        
## [13] gtable_0.1.2       KernSmooth_2.23-14 DBI_0.3.1         
## [16] gtools_3.5.0       htmltools_0.2.6    lazyeval_0.1.10   
## [19] assertthat_0.1     yaml_2.1.13        digest_0.6.8      
## [22] reshape2_1.4.1     formatR_1.2        bitops_1.0-6      
## [25] evaluate_0.7       rmarkdown_0.7      labeling_0.3      
## [28] gdata_2.16.1       stringi_0.5-5      proto_0.3-10