The following packages and datasets are used:
library(boot)
library(data.table)
# devtools::install_github('joon3216/funpark')
library(funpark)
library(ggplot2)
library(ggrepel)
library(pROC)
as_tibble <- dplyr::as_tibble
data(femsmoke, package = 'faraway')
data(nodal, package = 'SMPracticals')
1. Introduction
In order to fit a model using logistic regression, we require a binary response. However, if we are given categorical features, such as age group, then the data can be structured so that responses are not binary, but rather a count of cases for each combination of features. As an example, consider the `rich` dataset where:
- `is_rich` is a response variable having 1 if the person is rich and 0 otherwise
- `gender` is a categorical variable with two categories: `M` or `F`
- `criminal_record` is also a categorical variable: `yes` if the person has one, and `no` otherwise
# Simulate ten people; the three draws are made in this fixed order so that
# the seeded random stream yields the same data on every run.
set.seed(1024)
is_rich <- rbinom(n = 10, size = 1, prob = 0.5)
gender <- rbinom(n = 10, size = 1, prob = 0.5)
criminal_record <- rbinom(n = 10, size = 1, prob = 0.5)

# Binary response: one row per person, with the 0/1 draws for the two
# features recoded into readable labels by indexing a lookup vector.
(rich <- data.table(
  gender = c('F', 'M')[gender + 1L],
  criminal_record = c('no', 'yes')[criminal_record + 1L],
  is_rich = is_rich
))
## gender criminal_record
## 1: F no
## 2: M no
## 3: F no
## 4: F no
## 5: M yes
## 6: F yes
## 7: M yes
## 8: M no
## 9: M no
## 10: F yes
## is_rich
## 1: 0
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 1
## 7: 0
## 8: 0
## 9: 1
## 10: 1
This `rich` dataset has a binary response. However, when you are given a dataset for the first time, it may have the following form:
# Binomial response: one row per (gender, criminal_record) combination,
# holding the number of rich and not-rich people in that combination.
(rich_binom <- rich[
  order(gender, criminal_record),
  list(rich = sum(is_rich), not_rich = sum(!is_rich)),
  by = c('gender', 'criminal_record')
])
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## rich not_rich
## 1: 0 3
## 2: 2 0
## 3: 2 1
## 4: 0 2
or:
# Count response: stack the two count columns of rich_binom into one 'count'
# column; 'is_rich' now only labels which outcome each count belongs to.
(rich_pois <- melt(
  rich_binom,
  id.vars = c('gender', 'criminal_record'),
  measure.vars = c('rich', 'not_rich'),
  variable.name = 'is_rich',
  value.name = 'count'
))
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## 5: F no
## 6: F yes
## 7: M no
## 8: M yes
## is_rich count
## 1: rich 0
## 2: rich 2
## 3: rich 2
## 4: rich 0
## 5: not_rich 3
## 6: not_rich 0
## 7: not_rich 1
## 8: not_rich 2
The forms of `rich_binom` and `rich_pois` allow us to fit binomial and Poisson regression respectively, but not logistic regression. This document is about transforming the second and third datasets into the first one, i.e. binarizing datasets.
2. Case studies
The first case concerns the transformation from a binomial response to a binary response, and the second case the transformation from a count response to a binary response.
Case 1: nodal
`nodal` is the dataset in the `SMPracticals` package where:
- `m` is just a column of ones
- `r` is an indicator of nodal involvement (`1` for yes, `0` for no)
- `aged`, `stage`, `grade`, `xray`, `acid` are categorical features
See here for more information.
## # A tibble: 53 x 7
## m r aged stage
## <dbl> <dbl> <fct> <fct>
## 1 1 1 0 1
## 2 1 1 0 1
## 3 1 1 0 1
## 4 1 1 0 1
## 5 1 1 0 1
## 6 1 0 0 1
## 7 1 1 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 0
## # ... with 43 more rows, and
## # 3 more variables:
## # grade <fct>, xray <fct>,
## # acid <fct>
We can transform this into nodal_binom
, the data that has binomial responses:
# Collapse nodal to one row per feature combination. Within a combination,
# sum(r) counts the cases with nodal involvement and sum(m) is the number of
# rows (m is a column of ones), so their difference counts the cases without.
nodal_counts <- nodal[
  ,
  .(yes_node = sum(r), no_node = sum(m) - sum(r)),
  by = .(aged, stage, grade, xray, acid)
]
# Largest counts first
nodal_binom <- nodal_counts[order(-yes_node, -no_node)]
rm(nodal_counts)
as_tibble(nodal_binom)
## # A tibble: 23 x 7
## aged stage grade xray
## <fct> <fct> <fct> <fct>
## 1 0 1 1 1
## 2 1 1 0 0
## 3 0 1 1 0
## 4 0 0 0 0
## 5 1 1 0 0
## 6 0 1 0 0
## 7 0 0 1 0
## 8 1 1 1 1
## 9 1 1 0 1
## 10 1 0 1 1
## # ... with 13 more rows, and
## # 3 more variables:
## # acid <fct>,
## # yes_node <dbl>,
## # no_node <dbl>
nodal_binom
has a form of rich_binom
, where the last two columns are computed from r
, the binary response in the original nodal
data. yes_node
is a count of 1’s in r
in the original dataset, and no_node
a count of 0’s.
The function binarize_binom
will convert nodal_binom
back to nodal
. dpmf
and rpmf
(in funpark
package) are used to create random names for variable and value columns in order to avoid duplicate names:
binarize_binom <- function(dat, responses, variable.name = NULL) {
  # Expand a dataset with a binomial response (two count columns) into one
  # row per observation.
  #
  # dat: a data whose features are categorical
  # responses: a character vector of length 2; each element is the name of
  #            column that stores the counts of positive and negative
  #            responses, in this order.
  # variable.name: a character; the name of the new column holding, for each
  #                row, the name of the count column it came from. If NULL,
  #                then a randomly generated character will be used.
  #
  # Returns a data.table with the feature columns followed by the
  # variable.name column. All columns are character vectors (factors are
  # replaced by their labels). Rows appear count-column by count-column, in
  # the order the count columns appear in dat; zero and NA counts yield no
  # rows.
  #
  # NOTE: a dat that is not already a data.table is converted with setDT(),
  # i.e. by reference, so the caller's object becomes a data.table as well.
  # This side effect is kept on purpose for backward compatibility.

  col_names <- colnames(dat)
  if (!is.character(responses) || length(responses) != 2L ||
      !all(responses %in% col_names)) {
    stop("'responses' must name two columns of 'dat'", call. = FALSE)
  }
  if (!inherits(dat, 'data.table')) {
    setDT(dat)
  }
  id_vars <- col_names[!(col_names %in% responses)]
  # Count columns in the order they appear in dat
  measure_cols <- col_names[col_names %in% responses]

  if (is.null(variable.name)) {
    # Draw a random 10-letter name (uniform over 'A', 'a', ..., 'Z', 'z')
    # with funpark's dpmf/rpmf, and redraw in the unlikely event that it
    # clashes with a feature column.
    all_alphabets <- c(rbind(LETTERS, letters))
    dalphabet <- function(x) {
      dpmf(
        x,
        rep(1, length(all_alphabets)) / length(all_alphabets),
        all_alphabets
      )
    }
    repeat {
      variable_name <-
        paste0(rpmf(10, dalphabet, all_alphabets), collapse = '')
      if (!(variable_name %in% id_vars)) {
        break
      }
    }
    message('Randomly generated variable name used: ', variable_name)
  } else {
    variable_name <- variable.name
    if (variable_name %in% id_vars) {
      stop(
        "'variable.name' must differ from the feature column names",
        call. = FALSE
      )
    }
  }

  # Long layout, kept as three parallel vectors: the count, the row of dat
  # it belongs to, and the count column it was read from.
  n_rows <- nrow(dat)
  counts <- unlist(
    lapply(measure_cols, function(col) dat[[col]]),
    use.names = FALSE
  )
  source_row <- rep(seq_len(n_rows), times = length(measure_cols))
  label <- rep(measure_cols, each = n_rows)

  # Binarize: repeat every (row, label) pair as many times as its count
  keep <- which(!is.na(counts) & counts != 0)
  reps <- counts[keep]
  rows <- rep(source_row[keep], times = reps)

  result <- lapply(id_vars, function(col) as.character(dat[[col]][rows]))
  result[[length(result) + 1L]] <- rep(label[keep], times = reps)
  names(result) <- c(id_vars, variable_name)
  as.data.table(result)
}
Let’s see if it works on rich_binom
data:
## gender criminal_record
## 1: F no
## 2: M no
## 3: F no
## 4: F no
## 5: M yes
## 6: F yes
## 7: M yes
## 8: M no
## 9: M no
## 10: F yes
## is_rich
## 1: 0
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 1
## 7: 0
## 8: 0
## 9: 1
## 10: 1
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## rich not_rich
## 1: 0 3
## 2: 2 0
## 3: 2 1
## 4: 0 2
# binarized
(rich_binary <- binarize_binom(
dat = rich_binom,
responses = c('rich', 'not_rich'),
variable.name = 'is_rich'
))
## gender criminal_record
## 1: F yes
## 2: F yes
## 3: M no
## 4: M no
## 5: F no
## 6: F no
## 7: F no
## 8: M no
## 9: M yes
## 10: M yes
## is_rich
## 1: rich
## 2: rich
## 3: rich
## 4: rich
## 5: not_rich
## 6: not_rich
## 7: not_rich
## 8: not_rich
## 9: not_rich
## 10: not_rich
Are rich
and rich_binary
equivalent?
# Recode the outcome labels back to the 0/1 coding used in rich
rich_binary[, is_rich := as.integer(is_rich == 'rich')]
# rich is sorted into the row order of rich_binary before comparing.
# NOTE(review): this is presumably base::setequal, which would compare the
# two tables column by column - confirm.
setequal(rich[order(-is_rich, gender, criminal_record)], rich_binary)
## [1] TRUE
Yes, they are. Likewise, the binarized nodal_binom
is the same as the original nodal
:
## # A tibble: 23 x 7
## aged stage grade xray
## <fct> <fct> <fct> <fct>
## 1 0 1 1 1
## 2 1 1 0 0
## 3 0 1 1 0
## 4 0 0 0 0
## 5 1 1 0 0
## 6 0 1 0 0
## 7 0 0 1 0
## 8 1 1 1 1
## 9 1 1 0 1
## 10 1 0 1 1
## # ... with 13 more rows, and
## # 3 more variables:
## # acid <fct>,
## # yes_node <dbl>,
## # no_node <dbl>
## Randomly generated variable name used: LEXhgicUWS
## # A tibble: 53 x 6
## aged stage grade xray
## <chr> <chr> <chr> <chr>
## 1 0 1 1 1
## 2 0 1 1 1
## 3 0 1 1 1
## 4 0 1 1 1
## 5 0 1 1 1
## 6 1 1 0 0
## 7 1 1 0 0
## 8 0 1 1 0
## 9 0 1 1 0
## 10 0 0 0 0
## # ... with 43 more rows, and
## # 2 more variables:
## # acid <chr>,
## # LEXhgicUWS <chr>
Note that categorical variables are changed from factors into character vectors.
Case 2: femsmoke
The `femsmoke` data in the `faraway` package has the following columns:
- `y`: observed count for the given combination
- `smoker`, `dead`, `age`: categorical features
See here for details:
## y smoker dead age
## 1 2 yes yes 18-24
## 2 1 no yes 18-24
## 3 3 yes yes 25-34
## 4 5 no yes 25-34
## 5 14 yes yes 35-44
## 6 7 no yes 35-44
## 7 27 yes yes 45-54
## 8 12 no yes 45-54
## 9 51 yes yes 55-64
## 10 40 no yes 55-64
## 11 29 yes yes 65-74
## 12 101 no yes 65-74
## 13 13 yes yes 75+
## 14 64 no yes 75+
## 15 53 yes no 18-24
## 16 61 no no 18-24
## 17 121 yes no 25-34
## 18 152 no no 25-34
## 19 95 yes no 35-44
## 20 114 no no 35-44
## 21 103 yes no 45-54
## 22 66 no no 45-54
## 23 64 yes no 55-64
## 24 81 no no 55-64
## 25 7 yes no 65-74
## 26 28 no no 65-74
## 27 0 yes no 75+
## 28 0 no no 75+
The task is to populate each (smoker
, dead
, age
) combination according to the counts in y
. For example, in the new binarized dataset, there will be two rows of (smoker = yes
, dead = yes
, age = 18-24
), 64 rows of (smoker = no
, dead = yes
, age = 75+
), and so on.
`binarize_pois` handles datasets structured for Poisson regression:
binarize_pois <- function(dat, response) {
  # Expand a dataset with a count response into one row per observation.
  #
  # dat: a data whose features are categorical
  # response: a character; the name of column that stores the counts
  #
  # Returns a data.table holding every column of dat except the count
  # column, with each row of dat repeated as many times as its count (rows
  # whose count is zero or NA are dropped). All columns are returned as
  # character vectors (factors are replaced by their labels).
  #
  # NOTE: a dat that is not already a data.table is converted with setDT(),
  # i.e. by reference, so the caller's object becomes a data.table as well.
  # This side effect is kept on purpose for backward compatibility.

  col_names <- colnames(dat)
  if (!is.character(response) || length(response) != 1L ||
      !(response %in% col_names)) {
    stop("'response' must name one column of 'dat'", call. = FALSE)
  }
  if (!inherits(dat, 'data.table')) {
    setDT(dat)
  }
  id_vars <- col_names[!(col_names %in% response)]

  # Binarize: index of every row with a positive count, repeated count times
  counts <- dat[[response]]
  keep <- which(!is.na(counts) & counts != 0)
  rows <- rep(keep, times = counts[keep])

  # Rows are repeated by index rather than pasted into one string and split
  # again, so empty strings and missing values in the features survive.
  result <- lapply(id_vars, function(col) as.character(dat[[col]][rows]))
  names(result) <- id_vars
  as.data.table(result)
}
Now, recall rich_pois
:
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## 5: F no
## 6: F yes
## 7: M no
## 8: M yes
## is_rich count
## 1: rich 0
## 2: rich 2
## 3: rich 2
## 4: rich 0
## 5: not_rich 3
## 6: not_rich 0
## 7: not_rich 1
## 8: not_rich 2
If we pass rich_pois
into binarize_pois
, we get:
## gender criminal_record
## 1: F yes
## 2: F yes
## 3: M no
## 4: M no
## 5: F no
## 6: F no
## 7: F no
## 8: M no
## 9: M yes
## 10: M yes
## is_rich
## 1: rich
## 2: rich
## 3: rich
## 4: rich
## 5: not_rich
## 6: not_rich
## 7: not_rich
## 8: not_rich
## 9: not_rich
## 10: not_rich
Are rich_binary
and rich_binary2
equivalent?
## [1] TRUE
We check that they are. We can apply binarize_pois
to femsmoke
as follows:
## smoker dead age
## 1: yes yes 18-24
## 2: yes yes 18-24
## 3: no yes 18-24
## 4: yes yes 25-34
## 5: yes yes 25-34
## ---
## 1310: no no 65-74
## 1311: no no 65-74
## 1312: no no 65-74
## 1313: no no 65-74
## 1314: no no 65-74
3. Applications
ROC curve
By having a binary response, we can draw a ROC curve using plot_roc
function:
plot_roc <- function(dat, fmlr) {
  # Fit a logistic regression and draw its ROC curve, labelling every point
  # with its threshold and annotating the plot with the AUC.
  #
  # dat: a data with categorical features
  # fmlr: a formula
  #
  # Returns a ggplot object.

  mod <- glm(formula = fmlr, family = binomial, data = dat)

  # Take the observed and the fitted values from the model itself: both then
  # cover exactly the rows used in the fit, so they have the same length
  # even when glm dropped rows containing missing values.
  ests <- unname(mod$fitted.values)
  actual <- unname(model.response(model.frame(mod)))
  roc_result <- roc(actual, ests)

  # fmlr[2] and fmlr[3] are one-element calls; as.character() turns each
  # into a single string holding the left- and right-hand side of fmlr.
  response <- as.character(fmlr[2])
  features <- as.character(fmlr[3])

  roc_table <- data.table(
    TPR = roc_result$sensitivities,
    FPR = 1 - roc_result$specificities,
    thresholds = roc_result$thresholds
  )[
    order(TPR)
  ]

  ggplot(roc_table, aes(FPR, TPR, label = round(thresholds, 4))) +
    geom_point() +
    geom_label_repel(
      box.padding = 0.3,
      point.padding = 0.3,
      segment.color = "grey50"
    ) +
    geom_line() +
    # Reference diagonal, added once as an annotation instead of once per
    # row of roc_table
    annotate(
      "segment", x = 0, y = 0, xend = 1, yend = 1,
      colour = "red", linetype = "dashed"
    ) +
    annotate(
      "text", x = 1, y = .05, hjust = 1,
      label = paste0(
        "AUC : ", round(as.numeric(auc(roc_result)), 4)
      )
    ) +
    labs(
      x = "False positive rate",
      y = "True positive rate",
      title = "ROC curve",
      subtitle = paste0("- ", response, " ~ ", features)
    )
}
Here are some examples of ROC curves:
# femsmoke_binary: recode the outcome to 1 (dead) / 0 (alive) before fitting
femsmoke_binary[, dead := as.numeric(dead == 'yes')]
plot_roc(femsmoke_binary, dead ~ smoker + age)

# rich_big: the same simulation as rich, with 10000 people; the three draws
# are made in the same order as before.
is_rich <- rbinom(n = 10000, size = 1, prob = 0.5)
gender <- rbinom(n = 10000, size = 1, prob = 0.5)
criminal_record <- rbinom(n = 10000, size = 1, prob = 0.5)
rich_big <- data.table(
  gender = c('F', 'M')[gender + 1L],
  criminal_record = c('no', 'yes')[criminal_record + 1L],
  is_rich = is_rich
)
plot_roc(rich_big, is_rich ~ gender + criminal_record)
Confidence interval for AUC
Using boot::boot
and boot::boot.ci
, we can compute the approximate CI for AUC:
CI_auc <- function(dat, fmlr, R = 500, type = 'norm', ...) {
  # Bootstrap confidence interval for the AUC of a logistic regression.
  #
  # dat, fmlr: the same as in plot_roc
  # R: the same as R in boot::boot
  # type: the same as type in boot::boot.ci
  # ...: additional arguments of boot::boot.ci
  #
  # Returns the output of boot::boot.ci.

  # Statistic for boot::boot: refit the model on the resampled rows i and
  # return the AUC of that fit.
  AUC_boot <- function(dat, i) {
    resampled <- dat[i, ]
    mod <- glm(fmlr, family = binomial, data = resampled)
    # Observed and fitted values both come from the fitted model, so they
    # cover the same rows even when glm dropped rows with missing values.
    ests <- unname(mod$fitted.values)
    actual <- unname(model.response(model.frame(mod)))
    suppressMessages(as.numeric(pROC::auc(pROC::roc(actual, ests))))
  }

  boot_output <- boot::boot(dat, statistic = AUC_boot, R = R)
  boot::boot.ci(boot_output, type = type, ...)
}
Examples are as follows:
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = boot_output, type = type)
##
## Intervals :
## Level Normal
## 95% ( 0.8697, 0.9077 )
## Calculations and Intervals on Original Scale
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 200 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = boot_output, type = type)
##
## Intervals :
## Level Normal
## 95% ( 0.4969, 0.5128 )
## Calculations and Intervals on Original Scale
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = boot_output, type = type)
##
## Intervals :
## Level Percentile
## 95% ( 0.7728, 0.9735 )
## Calculations and Intervals on Original Scale
4. Aside: change_form
The function change_form
is designed to turn one form of data into another:
change_form <- function(dat, from, to,
                        old_response, category, new_response) {
  # Convert a dataset between the binary, binomial and poisson forms.
  #
  # dat: a data with categorical features; a response column/columns
  #      must be numeric (0 and 1 if from = 'binary', or nonnegative
  #      integers if from is either 'binomial' or 'poisson').
  # from: a character; either 'binary', 'binomial', or 'poisson'
  # to: a character != from; either 'binary', 'binomial', or 'poisson'
  # old_response: (always specified) a character vector of:
  #     * length 1 if from = 'binary' or 'poisson'; the name of column
  #       in dat that stores a response/count
  #     * length 2 if from = 'binomial'; the names of columns in dat
  #       that store positive and negative case counts, in this order.
  # category: (specified only if to = 'poisson', or if from = 'poisson'
  #     and to = 'binomial') a character vector of:
  #     * length 1 if from = 'binomial'; the new name of column that
  #       will store two names in old_response as positive and
  #       negative cases, in this order.
  #     * length 2 if from = 'binary'; the new names for positive and
  #       negative cases in the binary response column, in this order.
  #     * length 1 if from = 'poisson'; the name of the existing column
  #       in dat whose values become the new count columns.
  # new_response: (specified only if from != 'poisson') a character
  #     vector of:
  #     * length 1 if to = 'binary' or 'poisson'; the name of the new
  #       column in new data that will store either a binary or count
  #       response
  #     * length 2 if to = 'binomial'; the names of two columns in
  #       new data that will store positive and negative case counts,
  #       in this order.
  #
  # Returns the converted data. Conversions to 'binary' are delegated to
  # binarize_binom / binarize_pois; all others return a data.table.

  valid_forms <- c('binary', 'binomial', 'poisson')
  if (!(is.character(from) && length(from) == 1L && from %in% valid_forms)) {
    stop('"from" must be either "binary", "binomial", or "poisson"')
  }
  targets <- valid_forms[valid_forms != from]
  if (!(is.character(to) && length(to) == 1L && to %in% targets)) {
    stop(
      '"to" must be either "', targets[1], '" or "', targets[2], '"',
      ' if "from" = "', from, '"'
    )
  }

  col_names <- colnames(dat)
  id_vars <- col_names[!(col_names %in% old_response)]

  # Work on a data.table without touching the caller's object
  as_dt <- function(x) {
    if (inherits(x, 'data.table')) x else as.data.table(x)
  }

  # Count positive (sum(y)) and negative (sum(!y)) cases of the binary
  # response within every feature combination, and name the two count
  # columns after `labels`.
  count_cases <- function(labels) {
    if (!is.character(labels) || length(labels) != 2L) {
      stop('two names are required for the count columns', call. = FALSE)
    }
    tbl <- as_dt(dat)
    if (length(id_vars) > 0L) {
      counts <- tbl[
        ,
        {
          y <- .SD[[1L]]
          list(sum(y), sum(!y))
        },
        by = id_vars,
        .SDcols = old_response
      ]
    } else {
      counts <- tbl[
        ,
        {
          y <- .SD[[1L]]
          list(sum(y), sum(!y))
        },
        .SDcols = old_response
      ]
    }
    # The two count columns are always the last two of the result
    setnames(counts, length(id_vars) + 1:2, labels)
    counts[]
  }

  if (from == 'binary') {
    if (to == 'binomial') {
      return(count_cases(new_response))
    }
    # to == 'poisson': count first, then stack the two count columns.
    # The two columns can differ in type (sum(y) is double for a double
    # response while sum(!y) is integer); melt then warns about coercing
    # them, which is the only reason for suppressWarnings here.
    return(suppressWarnings(melt(
      count_cases(category),
      id.vars = id_vars,
      variable.name = old_response,
      value.name = new_response
    )))
  }

  if (from == 'binomial') {
    if (to == 'binary') {
      return(binarize_binom(dat, old_response, new_response))
    }
    # to == 'poisson'
    return(melt(
      as_dt(dat),
      id.vars = id_vars,
      variable.name = category,
      value.name = new_response
    ))
  }

  # from == 'poisson'
  if (to == 'binary') {
    return(binarize_pois(dat, old_response))
  }
  # to == 'binomial': spread the counts over one column per category value.
  # Names are backtick-quoted so that the formula is built correctly for
  # column names that are not syntactic R names.
  quote_name <- function(x) paste0('`', x, '`')
  id_vars <- id_vars[!(id_vars %in% category)]
  lhs <- if (length(id_vars) > 0L) {
    paste0(quote_name(id_vars), collapse = ' + ')
  } else {
    '.'
  }
  fmlr <- as.formula(paste0(lhs, ' ~ ', quote_name(category)))
  dcast(as_dt(dat), fmlr, value.var = old_response)
}
Examples are as follows:
change_form(
rich,
from = 'binary', to = 'binomial',
old_response = 'is_rich',
new_response = c('rich', 'not_rich')
)
## gender criminal_record
## 1: F no
## 2: M no
## 3: M yes
## 4: F yes
## rich not_rich
## 1: 0 3
## 2: 2 1
## 3: 0 2
## 4: 2 0
change_form(
rich,
from = 'binary', to = 'poisson',
old_response = 'is_rich',
category = c('rich', 'not_rich'),
new_response = 'count'
)
## gender criminal_record
## 1: F no
## 2: M no
## 3: M yes
## 4: F yes
## 5: F no
## 6: M no
## 7: M yes
## 8: F yes
## is_rich count
## 1: rich 0
## 2: rich 2
## 3: rich 0
## 4: rich 2
## 5: not_rich 3
## 6: not_rich 1
## 7: not_rich 2
## 8: not_rich 0
nodal <- nodal[, -1]
as_tibble(change_form(
nodal,
from = 'binary', to = 'poisson',
old_response = 'r',
category = c('yes', 'no'),
new_response = 'count'
))
## # A tibble: 46 x 7
## aged stage grade xray
## <fct> <fct> <fct> <fct>
## 1 0 1 1 1
## 2 0 0 0 0
## 3 1 1 1 0
## 4 1 1 0 0
## 5 0 0 0 0
## 6 0 1 1 0
## 7 1 1 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 1
## # ... with 36 more rows, and
## # 3 more variables:
## # acid <fct>, r <fct>,
## # count <dbl>
change_form(
rich_binom,
from = 'binomial', to = 'binary',
old_response = c('rich', 'not_rich'),
new_response = 'is_rich'
)
## gender criminal_record
## 1: F yes
## 2: F yes
## 3: M no
## 4: M no
## 5: F no
## 6: F no
## 7: F no
## 8: M no
## 9: M yes
## 10: M yes
## is_rich
## 1: rich
## 2: rich
## 3: rich
## 4: rich
## 5: not_rich
## 6: not_rich
## 7: not_rich
## 8: not_rich
## 9: not_rich
## 10: not_rich
change_form(
rich_binom,
from = 'binomial', to = 'poisson',
old_response = c('rich', 'not_rich'),
category = 'is_rich',
new_response = 'count'
)
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## 5: F no
## 6: F yes
## 7: M no
## 8: M yes
## is_rich count
## 1: rich 0
## 2: rich 2
## 3: rich 2
## 4: rich 0
## 5: not_rich 3
## 6: not_rich 0
## 7: not_rich 1
## 8: not_rich 2
## gender criminal_record
## 1: F yes
## 2: F yes
## 3: M no
## 4: M no
## 5: F no
## 6: F no
## 7: F no
## 8: M no
## 9: M yes
## 10: M yes
## is_rich
## 1: rich
## 2: rich
## 3: rich
## 4: rich
## 5: not_rich
## 6: not_rich
## 7: not_rich
## 8: not_rich
## 9: not_rich
## 10: not_rich
change_form(
rich_pois,
from = 'poisson', to = 'binomial',
old_response = 'count',
category = 'is_rich'
)
## gender criminal_record
## 1: F no
## 2: F yes
## 3: M no
## 4: M yes
## rich not_rich
## 1: 0 3
## 2: 2 0
## 3: 2 1
## 4: 0 2
change_form(
femsmoke,
from = 'poisson', to = 'binomial',
old_response = 'y',
category = 'dead'
) # i.e. yes if dead, no if alive
## smoker age yes no
## 1: yes 18-24 2 53
## 2: yes 25-34 3 121
## 3: yes 35-44 14 95
## 4: yes 45-54 27 103
## 5: yes 55-64 51 64
## 6: yes 65-74 29 7
## 7: yes 75+ 13 0
## 8: no 18-24 1 61
## 9: no 25-34 5 152
## 10: no 35-44 7 114
## 11: no 45-54 12 66
## 12: no 55-64 40 81
## 13: no 65-74 101 28
## 14: no 75+ 64 0
Session info
R session info:
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Canada.1252
## [2] LC_CTYPE=English_Canada.1252
## [3] LC_MONETARY=English_Canada.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_Canada.1252
##
## attached base packages:
## [1] stats graphics
## [3] grDevices utils
## [5] datasets methods
## [7] base
##
## other attached packages:
## [1] pROC_1.15.3
## [2] ggrepel_0.8.1
## [3] ggplot2_3.2.1
## [4] funpark_0.2.6
## [5] data.table_1.12.6
## [6] boot_1.3-22
## [7] rmarkdown_1.17
## [8] magrittr_1.5
## [9] itertools2_0.1.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3
## [2] plyr_1.8.4
## [3] compiler_3.6.1
## [4] pillar_1.4.2
## [5] prettydoc_0.3.1
## [6] iterators_1.0.12
## [7] tools_3.6.1
## [8] zeallot_0.1.0
## [9] digest_0.6.23
## [10] jsonlite_1.6
## [11] evaluate_0.14
## [12] lifecycle_0.1.0
## [13] tibble_2.1.3
## [14] gtable_0.3.0
## [15] pkgconfig_2.0.3
## [16] rlang_0.4.2
## [17] cli_1.1.0
## [18] rstudioapi_0.10
## [19] yaml_2.2.0
## [20] xfun_0.11
## [21] withr_2.1.2
## [22] stringr_1.4.0
## [23] dplyr_0.8.3
## [24] xml2_1.2.2
## [25] knitr_1.26
## [26] vctrs_0.2.0
## [27] tidyselect_0.2.5
## [28] grid_3.6.1
## [29] glue_1.3.1
## [30] R6_2.4.1
## [31] fansi_0.4.0
## [32] farver_2.0.1
## [33] purrr_0.3.3
## [34] backports_1.1.5
## [35] scales_1.1.0
## [36] htmltools_0.4.0
## [37] assertthat_0.2.1
## [38] colorspace_1.4-1
## [39] labeling_0.3
## [40] utf8_1.1.4
## [41] stringi_1.4.3
## [42] lazyeval_0.2.2
## [43] munsell_0.5.0
## [44] crayon_1.3.4