Skip to contents

woebin_summary

# Bin the columns of germancredit against the target column "creditability".
bins <- woebin(germancredit, y = "creditability")
#>  Creating woe binning ...
#>  Binning on 1000 rows and 21 columns in 00:00:02

# Condense the binning result into a summary tibble with one row per variable.
binssummary <- woebin_summary(bins)

# Print the full summary.
binssummary
#> # A tibble: 20 × 15
#>    variable       n_cat…¹      iv       ks   hhi count…² count…³ has_m…⁴ has_s…⁵
#>    <chr>            <int>   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <lgl>   <lgl>  
#>  1 status.of.exi…       3 6.39e-1 3.67e- 1 0.333   0.543   0.063 FALSE   FALSE  
#>  2 credit.history       4 2.92e-1 1.80e- 1 0.25    0.53    0.088 FALSE   FALSE  
#>  3 duration.in.m…       5 2.83e-1 1.92e- 1 0.2     0.399   0.07  FALSE   FALSE  
#>  4 savings.accou…       3 1.91e-1 1.87e- 1 0.333   0.603   0.103 FALSE   FALSE  
#>  5 credit.amount        5 1.81e-1 1.58e- 1 0.2     0.382   0.05  FALSE   FALSE  
#>  6 purpose              3 1.53e-1 1.79e- 1 0.333   0.608   0.112 FALSE   FALSE  
#>  7 age.in.years         5 1.30e-1 1.47e- 1 0.2     0.373   0.079 FALSE   FALSE  
#>  8 property             4 1.13e-1 1.17e- 1 0.25    0.332   0.154 FALSE   FALSE  
#>  9 present.emplo…       4 8.53e-2 1.20e- 1 0.25    0.339   0.174 FALSE   FALSE  
#> 10 housing              3 8.33e-2 1.33e- 1 0.333   0.713   0.108 FALSE   FALSE  
#> 11 other.install…       2 5.76e-2 9.62e- 2 0.5     0.814   0.186 FALSE   FALSE  
#> 12 installment.r…       3 2.56e-2 7.71e- 2 0.333   0.476   0.157 FALSE   FALSE  
#> 13 other.debtors…       2 1.64e-2 2.67e- 2 0.5     0.948   0.052 FALSE   FALSE  
#> 14 number.of.exi…       2 1.01e-2 4.81e- 2 0.5     0.633   0.367 FALSE   FALSE  
#> 15 personal.stat…       4 8.84e-3 3.33e- 2 0.25    0.548   0.05  FALSE   FALSE  
#> 16 job                  3 8.10e-3 3.14e- 2 0.333   0.63    0.148 FALSE   FALSE  
#> 17 telephone            2 6.38e-3 3.90e- 2 0.5     0.596   0.404 FALSE   FALSE  
#> 18 present.resid…       4 3.59e-3 2.24e- 2 0.25    0.413   0.13  FALSE   FALSE  
#> 19 number.of.peo…       2 4.34e-5 2.38e- 3 0.5     0.845   0.155 FALSE   FALSE  
#> 20 foreign.worker       1 0       6.51e-17 1       1       1     FALSE   FALSE  
#> # … with 6 more variables: monotone <lgl>, factor <lgl>, breaks <list>,
#> #   iv_lbl <fct>, hhi_lbl <fct>, distribution <chr>, and abbreviated variable
#> #   names ¹​n_categories, ²​count_distr_max, ³​count_distr_min, ⁴​has_missing,
#> #   ⁵​has_special_values

# Narrow the summary to ks/iv and hhi with their labels, plus the
# per-bin distribution sparkline.
select(
  binssummary,
  variable, ks, iv, iv_lbl, hhi, hhi_lbl, distribution
)
#> # A tibble: 20 × 7
#>    variable                              ks      iv iv_lbl   hhi hhi_lbl distr…¹
#>    <chr>                              <dbl>   <dbl> <fct>  <dbl> <fct>   <chr>  
#>  1 status.of.existing.checking.ac… 3.67e- 1 6.39e-1 suspi… 0.333 high c… "▇▁▆"  
#>  2 credit.history                  1.80e- 1 2.92e-1 medium 0.25  modera… "▂▇▁▅" 
#>  3 duration.in.month               1.92e- 1 2.83e-1 medium 0.2   modera… "▂▇▇▂▂"
#>  4 savings.account.and.bonds       1.87e- 1 1.91e-1 medium 0.333 high c… "▇▂▃"  
#>  5 credit.amount                   1.58e- 1 1.81e-1 medium 0.2   modera… "▆▂▇▅▁"
#>  6 purpose                         1.79e- 1 1.53e-1 medium 0.333 high c… "▂▃▇"  
#>  7 age.in.years                    1.47e- 1 1.30e-1 medium 0.2   modera… "▅▂▆▂▇"
#>  8 property                        1.17e- 1 1.13e-1 medium 0.25  modera… "▇▆▇▃" 
#>  9 present.employment.since        1.20e- 1 8.53e-2 weak   0.25  modera… "▆▇▅▆" 
#> 10 housing                         1.33e- 1 8.33e-2 weak   0.333 high c… "▂▇▁"  
#> 11 other.installment.plans         9.62e- 2 5.76e-2 weak   0.5   high c… "▂▇"   
#> 12 installment.rate.in.percentage… 7.71e- 2 2.56e-2 weak   0.333 high c… "▆▂▇"  
#> 13 other.debtors.or.guarantors     2.67e- 2 1.64e-2 unpre… 0.5   high c… "▇▁"   
#> 14 number.of.existing.credits.at.… 4.81e- 2 1.01e-2 unpre… 0.5   high c… "▇▅"   
#> 15 personal.status.and.sex         3.33e- 2 8.84e-3 unpre… 0.25  modera… "▁▅▇▂" 
#> 16 job                             3.14e- 2 8.10e-3 unpre… 0.333 high c… "▃▇▂"  
#> 17 telephone                       3.90e- 2 6.38e-3 unpre… 0.5   high c… "▇▆"   
#> 18 present.residence.since         2.24e- 2 3.59e-3 unpre… 0.25  modera… "▂▆▃▇" 
#> 19 number.of.people.being.liable.… 2.38e- 3 4.34e-5 unpre… 0.5   high c… "▇▂"   
#> 20 foreign.worker                  6.51e-17 0       unpre… 1     high c… ""     
#> # … with abbreviated variable name ¹​distribution

woebin_ply_min

The woebin_ply_min function doesn’t need a data frame argument: it takes a plain vector of values and the bin table of a single variable.

# First six credit.amount values, as a plain vector rather than a data frame.
variable <- head(germancredit$credit.amount, 6)

# Bin table for the same variable, taken from the woebin() result.
bin <- bins$credit.amount

# Print the bin table.
bin
#>         variable         bin count count_distr neg pos   posprob         woe
#> 1: credit.amount [-Inf,1400)   267       0.267 185  82 0.3071161  0.03366128
#> 2: credit.amount [1400,1800)   105       0.105  87  18 0.1714286 -0.72823850
#> 3: credit.amount [1800,4000)   382       0.382 287  95 0.2486911 -0.25830746
#> 4: credit.amount [4000,9200)   196       0.196 120  76 0.3877551  0.39053946
#> 5: credit.amount [9200, Inf)    50       0.050  21  29 0.5800000  1.17007125
#>          bin_iv  total_iv breaks is_special_values
#> 1: 0.0003045545 0.1812204   1400             FALSE
#> 2: 0.0468153322 0.1812204   1800             FALSE
#> 3: 0.0241086966 0.1812204   4000             FALSE
#> 4: 0.0319870413 0.1812204   9200             FALSE
#> 5: 0.0780047502 0.1812204    Inf             FALSE

# Default call: the returned values match entries of the woe column of `bin`.
woebin_ply_min(variable, bin)
#>  Converting into woe values ...
#> [1]  0.03366128  0.39053946 -0.25830746  0.39053946  0.39053946  0.39053946

# Same lookup, but returning the posprob column of `bin` instead of woe.
# NOTE(review): the status message still says "woe values" in this mode.
woebin_ply_min(variable, bin, value = "posprob")
#>  Converting into woe values ...
#> [1] 0.3071161 0.3877551 0.2486911 0.3877551 0.3877551 0.3877551

woebin_cor_iv

# Pairwise correlations between the woe-transformed variables, returned with
# each variable's IV and IV rank.
# NOTE(review): the zero-standard-deviation warning below presumably comes from
# foreign.worker, which has a single bin (iv = 0) and hence a constant woe
# value -- confirm.
datcor <- woebin_cor_iv(germancredit, bins)
#>  Converting into woe values ...
#>  Woe transformating on 1000 rows and 20 columns in 00:00:00
#> Warning in stats::cor(x = x, y = y, use = use, method = method): the standard
#> deviation is zero
#> Correlation computed with
#>  Method: 'pearson'
#>  Missing treated using: 'pairwise.complete.obs'

# Print the correlation table.
datcor
#> # A tibble: 400 × 7
#>    var1                            var2        r var1_iv var1_…¹ var2_iv var2_…²
#>    <fct>                           <fct>   <dbl>   <dbl>   <int>   <dbl>   <int>
#>  1 status.of.existing.checking.ac… stat… NA        0.639       1  0.639        1
#>  2 status.of.existing.checking.ac… cred…  0.203    0.639       1  0.292        2
#>  3 status.of.existing.checking.ac… dura…  0.0985   0.639       1  0.283        3
#>  4 status.of.existing.checking.ac… savi…  0.219    0.639       1  0.191        4
#>  5 status.of.existing.checking.ac… cred…  0.0916   0.639       1  0.181        5
#>  6 status.of.existing.checking.ac… purp…  0.133    0.639       1  0.153        6
#>  7 status.of.existing.checking.ac… age.…  0.129    0.639       1  0.130        7
#>  8 status.of.existing.checking.ac… prop…  0.0652   0.639       1  0.113        8
#>  9 status.of.existing.checking.ac… pres…  0.119    0.639       1  0.0853       9
#> 10 status.of.existing.checking.ac… hous…  0.112    0.639       1  0.0833      10
#> # … with 390 more rows, and abbreviated variable names ¹​var1_rank, ²​var2_rank

# Absolute correlation above which a pair of variables is treated as redundant.
cor_limit <- 0.15

# Flag strongly correlated pairs and, for each flagged pair, nominate the
# variable with the weaker IV for removal.
datcor %>%
  filter(var1 != var2) %>%
  mutate(
    # The comparison already yields TRUE/FALSE/NA; no ifelse() wrapper needed.
    cor_conflict = abs(r) > cor_limit,
    # var1 and var2 are unordered factors, so `var1 > var2` only produces NA
    # and an Ops.factor warning. Compare the IV ranks instead (rank 1 is the
    # highest IV) and return the name as character, not a factor code.
    variable_to_remove = if_else(
      cor_conflict,
      if_else(var1_rank < var2_rank, as.character(var2), as.character(var1)),
      NA_character_
    )
  )
#> # A tibble: 380 × 9
#>    var1             var2       r var1_iv var1_…¹ var2_iv var2_…² cor_c…³ varia…⁴
#>    <fct>            <fct>  <dbl>   <dbl>   <int>   <dbl>   <int> <lgl>   <chr>  
#>  1 status.of.exist… cred… 0.203    0.639       1  0.292        2 TRUE    credit…
#>  2 status.of.exist… dura… 0.0985   0.639       1  0.283        3 FALSE   NA     
#>  3 status.of.exist… savi… 0.219    0.639       1  0.191        4 TRUE    saving…
#>  4 status.of.exist… cred… 0.0916   0.639       1  0.181        5 FALSE   NA     
#>  5 status.of.exist… purp… 0.133    0.639       1  0.153        6 FALSE   NA     
#>  6 status.of.exist… age.… 0.129    0.639       1  0.130        7 FALSE   NA     
#>  7 status.of.exist… prop… 0.0652   0.639       1  0.113        8 FALSE   NA     
#>  8 status.of.exist… pres… 0.119    0.639       1  0.0853       9 FALSE   NA     
#>  9 status.of.exist… hous… 0.112    0.639       1  0.0833      10 FALSE   NA     
#> 10 status.of.exist… othe… 0.0489   0.639       1  0.0576      11 FALSE   NA     
#> # … with 370 more rows, and abbreviated variable names ¹​var1_rank, ²​var2_rank,
#> #   ³​cor_conflict, ⁴​variable_to_remove