woebin_summary
# Bin the predictors of germancredit against the "creditability" target.
bins <- woebin(germancredit, y = "creditability")
#> ℹ Creating woe binning ...
#> ✔ Binning on 1000 rows and 21 columns in 00:00:02
# Condense the binning result into one summary row per binned variable,
# then print it.
binssummary <- woebin_summary(bins)
binssummary
#> # A tibble: 20 × 15
#> variable n_cat…¹ iv ks hhi count…² count…³ has_m…⁴ has_s…⁵
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
#> 1 status.of.exi… 3 6.39e-1 3.67e- 1 0.333 0.543 0.063 FALSE FALSE
#> 2 credit.history 4 2.92e-1 1.80e- 1 0.25 0.53 0.088 FALSE FALSE
#> 3 duration.in.m… 5 2.83e-1 1.92e- 1 0.2 0.399 0.07 FALSE FALSE
#> 4 savings.accou… 3 1.91e-1 1.87e- 1 0.333 0.603 0.103 FALSE FALSE
#> 5 credit.amount 5 1.81e-1 1.58e- 1 0.2 0.382 0.05 FALSE FALSE
#> 6 purpose 3 1.53e-1 1.79e- 1 0.333 0.608 0.112 FALSE FALSE
#> 7 age.in.years 5 1.30e-1 1.47e- 1 0.2 0.373 0.079 FALSE FALSE
#> 8 property 4 1.13e-1 1.17e- 1 0.25 0.332 0.154 FALSE FALSE
#> 9 present.emplo… 4 8.53e-2 1.20e- 1 0.25 0.339 0.174 FALSE FALSE
#> 10 housing 3 8.33e-2 1.33e- 1 0.333 0.713 0.108 FALSE FALSE
#> 11 other.install… 2 5.76e-2 9.62e- 2 0.5 0.814 0.186 FALSE FALSE
#> 12 installment.r… 3 2.56e-2 7.71e- 2 0.333 0.476 0.157 FALSE FALSE
#> 13 other.debtors… 2 1.64e-2 2.67e- 2 0.5 0.948 0.052 FALSE FALSE
#> 14 number.of.exi… 2 1.01e-2 4.81e- 2 0.5 0.633 0.367 FALSE FALSE
#> 15 personal.stat… 4 8.84e-3 3.33e- 2 0.25 0.548 0.05 FALSE FALSE
#> 16 job 3 8.10e-3 3.14e- 2 0.333 0.63 0.148 FALSE FALSE
#> 17 telephone 2 6.38e-3 3.90e- 2 0.5 0.596 0.404 FALSE FALSE
#> 18 present.resid… 4 3.59e-3 2.24e- 2 0.25 0.413 0.13 FALSE FALSE
#> 19 number.of.peo… 2 4.34e-5 2.38e- 3 0.5 0.845 0.155 FALSE FALSE
#> 20 foreign.worker 1 0 6.51e-17 1 1 1 FALSE FALSE
#> # … with 6 more variables: monotone <lgl>, factor <lgl>, breaks <list>,
#> # iv_lbl <fct>, hhi_lbl <fct>, distribution <chr>, and abbreviated variable
#> # names ¹n_categories, ²count_distr_max, ³count_distr_min, ⁴has_missing,
#> # ⁵has_special_values
# Narrow the summary to the ks, iv and hhi metrics, their labels, and the
# per-bin distribution sparkline.
binssummary %>%
  select(
    variable,
    ks, iv, iv_lbl,
    hhi, hhi_lbl,
    distribution
  )
#> # A tibble: 20 × 7
#> variable ks iv iv_lbl hhi hhi_lbl distr…¹
#> <chr> <dbl> <dbl> <fct> <dbl> <fct> <chr>
#> 1 status.of.existing.checking.ac… 3.67e- 1 6.39e-1 suspi… 0.333 high c… "▇▁▆"
#> 2 credit.history 1.80e- 1 2.92e-1 medium 0.25 modera… "▂▇▁▅"
#> 3 duration.in.month 1.92e- 1 2.83e-1 medium 0.2 modera… "▂▇▇▂▂"
#> 4 savings.account.and.bonds 1.87e- 1 1.91e-1 medium 0.333 high c… "▇▂▃"
#> 5 credit.amount 1.58e- 1 1.81e-1 medium 0.2 modera… "▆▂▇▅▁"
#> 6 purpose 1.79e- 1 1.53e-1 medium 0.333 high c… "▂▃▇"
#> 7 age.in.years 1.47e- 1 1.30e-1 medium 0.2 modera… "▅▂▆▂▇"
#> 8 property 1.17e- 1 1.13e-1 medium 0.25 modera… "▇▆▇▃"
#> 9 present.employment.since 1.20e- 1 8.53e-2 weak 0.25 modera… "▆▇▅▆"
#> 10 housing 1.33e- 1 8.33e-2 weak 0.333 high c… "▂▇▁"
#> 11 other.installment.plans 9.62e- 2 5.76e-2 weak 0.5 high c… "▂▇"
#> 12 installment.rate.in.percentage… 7.71e- 2 2.56e-2 weak 0.333 high c… "▆▂▇"
#> 13 other.debtors.or.guarantors 2.67e- 2 1.64e-2 unpre… 0.5 high c… "▇▁"
#> 14 number.of.existing.credits.at.… 4.81e- 2 1.01e-2 unpre… 0.5 high c… "▇▅"
#> 15 personal.status.and.sex 3.33e- 2 8.84e-3 unpre… 0.25 modera… "▁▅▇▂"
#> 16 job 3.14e- 2 8.10e-3 unpre… 0.333 high c… "▃▇▂"
#> 17 telephone 3.90e- 2 6.38e-3 unpre… 0.5 high c… "▇▆"
#> 18 present.residence.since 2.24e- 2 3.59e-3 unpre… 0.25 modera… "▂▆▃▇"
#> 19 number.of.people.being.liable.… 2.38e- 3 4.34e-5 unpre… 0.5 high c… "▇▂"
#> 20 foreign.worker 6.51e-17 0 unpre… 1 high c… ""
#> # … with abbreviated variable name ¹distribution
woebin_ply_min
The woebin_ply_min
function doesn’t need a data frame argument: it takes a plain vector of values and the binning table of a single variable.
# First six credit amounts, used as a small standalone input vector.
variable <- head(germancredit[["credit.amount"]], n = 6)
# Binning table stored in `bins` for credit.amount; printed for reference.
bin <- bins[["credit.amount"]]
bin
#> variable bin count count_distr neg pos posprob woe
#> 1: credit.amount [-Inf,1400) 267 0.267 185 82 0.3071161 0.03366128
#> 2: credit.amount [1400,1800) 105 0.105 87 18 0.1714286 -0.72823850
#> 3: credit.amount [1800,4000) 382 0.382 287 95 0.2486911 -0.25830746
#> 4: credit.amount [4000,9200) 196 0.196 120 76 0.3877551 0.39053946
#> 5: credit.amount [9200, Inf) 50 0.050 21 29 0.5800000 1.17007125
#> bin_iv total_iv breaks is_special_values
#> 1: 0.0003045545 0.1812204 1400 FALSE
#> 2: 0.0468153322 0.1812204 1800 FALSE
#> 3: 0.0241086966 0.1812204 4000 FALSE
#> 4: 0.0319870413 0.1812204 9200 FALSE
#> 5: 0.0780047502 0.1812204 Inf FALSE
# Map each raw value to the woe of the bin it falls into
# (e.g. the first value lands in [-Inf,1400), woe 0.03366128).
woebin_ply_min(variable, bin)
#> ℹ Converting into woe values ...
#> [1] 0.03366128 0.39053946 -0.25830746 0.39053946 0.39053946 0.39053946
# Same bin lookup, but return the bin's `posprob` column instead of `woe`.
woebin_ply_min(variable, bin, value = "posprob")
#> ℹ Converting into woe values ...
#> [1] 0.3071161 0.3877551 0.2486911 0.3877551 0.3877551 0.3877551
woebin_cor_iv
# Correlate the woe-transformed predictors pairwise and attach each
# variable's IV and IV rank (one row per var1/var2 pair).
# NOTE: the zero-standard-deviation warning below presumably comes from
# foreign.worker, which was binned into a single category and therefore has
# a constant woe value; its correlations are NA.
datcor <- woebin_cor_iv(germancredit, bins)
#> ℹ Converting into woe values ...
#> ✔ Woe transformating on 1000 rows and 20 columns in 00:00:00
#> Warning in stats::cor(x = x, y = y, use = use, method = method): the standard
#> deviation is zero
#> Correlation computed with
#> • Method: 'pearson'
#> • Missing treated using: 'pairwise.complete.obs'
datcor
#> # A tibble: 400 × 7
#> var1 var2 r var1_iv var1_…¹ var2_iv var2_…²
#> <fct> <fct> <dbl> <dbl> <int> <dbl> <int>
#> 1 status.of.existing.checking.ac… stat… NA 0.639 1 0.639 1
#> 2 status.of.existing.checking.ac… cred… 0.203 0.639 1 0.292 2
#> 3 status.of.existing.checking.ac… dura… 0.0985 0.639 1 0.283 3
#> 4 status.of.existing.checking.ac… savi… 0.219 0.639 1 0.191 4
#> 5 status.of.existing.checking.ac… cred… 0.0916 0.639 1 0.181 5
#> 6 status.of.existing.checking.ac… purp… 0.133 0.639 1 0.153 6
#> 7 status.of.existing.checking.ac… age.… 0.129 0.639 1 0.130 7
#> 8 status.of.existing.checking.ac… prop… 0.0652 0.639 1 0.113 8
#> 9 status.of.existing.checking.ac… pres… 0.119 0.639 1 0.0853 9
#> 10 status.of.existing.checking.ac… hous… 0.112 0.639 1 0.0833 10
#> # … with 390 more rows, and abbreviated variable names ¹var1_rank, ²var2_rank
# Flag pairs of predictors whose absolute correlation exceeds `cor_limit`
# and, for each flagged pair, name the variable with the lower IV as the
# candidate to drop.
cor_limit <- 0.15
datcor %>%
  # Drop the diagonal (a variable paired with itself).
  filter(var1 != var2) %>%
  mutate(
    # NA when `r` is NA (e.g. a constant woe column).
    cor_conflict = abs(r) > cor_limit,
    # `var1` and `var2` are unordered factors, so they cannot be compared
    # with `>`; compare their IVs instead and return the name as character
    # (ifelse() on factors would return integer codes).
    variable_to_remove = if_else(
      cor_conflict,
      if_else(var1_iv > var2_iv, as.character(var2), as.character(var1)),
      NA_character_
    )
  )
#> # A tibble: 380 × 9
#> var1 var2 r var1_iv var1_…¹ var2_iv var2_…² cor_c…³ varia…⁴
#> <fct> <fct> <dbl> <dbl> <int> <dbl> <int> <lgl> <chr>
#> 1 status.of.exist… cred… 0.203 0.639 1 0.292 2 TRUE credit…
#> 2 status.of.exist… dura… 0.0985 0.639 1 0.283 3 FALSE NA
#> 3 status.of.exist… savi… 0.219 0.639 1 0.191 4 TRUE saving…
#> 4 status.of.exist… cred… 0.0916 0.639 1 0.181 5 FALSE NA
#> 5 status.of.exist… purp… 0.133 0.639 1 0.153 6 FALSE NA
#> 6 status.of.exist… age.… 0.129 0.639 1 0.130 7 FALSE NA
#> 7 status.of.exist… prop… 0.0652 0.639 1 0.113 8 FALSE NA
#> 8 status.of.exist… pres… 0.119 0.639 1 0.0853 9 FALSE NA
#> 9 status.of.exist… hous… 0.112 0.639 1 0.0833 10 FALSE NA
#> 10 status.of.exist… othe… 0.0489 0.639 1 0.0576 11 FALSE NA
#> # … with 370 more rows, and abbreviated variable names ¹var1_rank, ²var2_rank,
#> # ³cor_conflict, ⁴variable_to_remove