3 Modeling and model comparison
As noted in the EDA section:
The relationship between \(log_{10}(pop\_density)\) and \(log_{10}(light\_avg)\) in Figures 2.4 and 2.5 isn't quite linear, and there are differences by state. So I expect simple linear models to perform worse than more sophisticated ones. Nonetheless, I want to see how good linear models are, since they are simple and inexpensive.
In this section I do the following:
- Prepare train-test and holdout data sets
- Build (and where necessary, train) some models, then compare them using performance metrics
- Test the best performing models against the holdout data set.
I use the following performance metrics:
- Concordance correlation coefficient (ccc), which according to the help for yardstick::ccc() "is a metric of both consistency/correlation and accuracy."
- R-squared. I use yardstick::rsq(), which according to the documentation "is simply the squared correlation between truth and estimate ... [that] guarantees a value on \((0,1)\)."
I use metrics that are scale-free, since I want the freedom to use models with different transformations of the data. The principal metric is ccc.
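To see why ccc is the principal metric: Lin's concordance correlation coefficient \(\rho_c = \frac{2\rho\sigma_x\sigma_y}{\sigma_x^2 + \sigma_y^2 + (\mu_x - \mu_y)^2}\) penalizes systematic bias as well as lack of correlation, while R-squared ignores bias. A minimal sketch on toy data (illustration only, not part of the analysis):
# Toy illustration: a prediction that is perfectly correlated with the truth
# but shifted by a constant still gets rsq = 1, while ccc is penalized for the bias.
library(tibble)
library(yardstick)

toy <- tibble(truth = as.numeric(1:10),
              estimate = as.numeric(1:10) + 2)  # perfect correlation, constant bias

rsq(toy, truth = truth, estimate = estimate)  # 1.0
ccc(toy, truth = truth, estimate = estimate)  # < 1.0 because of the shift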
# For 2000
data_df <- tar_read(all_df) %>%
  # fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)

# prepare data for train-test and holdout
d_df <- data_df %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)

# following https://cran.r-project.org/web/packages/rsample/vignettes/Working_with_rsets.html
set.seed(123)
main_split <- initial_split(d_df, strata = c("state"))
train_split <- analysis(main_split)
holdout <- assessment(main_split)

light_folds <- vfold_cv(train_split, v = n_folds, repeats = 1)
# For 2010
data_df_c <- tar_read(all_df_c) %>%
  # fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)

# prepare data for train-test and holdout
assess_df <- data_df_c %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)
set.seed(2021)

# inspiration/source: Dave Robinson's coding videos; see https://github.com/dgrtwo/data-screencasts
prep_juice <- function(x) juice(prep(x))

mset <- metric_set(rsq, ccc) # use scale-free metrics (not rmse)

grid_control <- control_grid(save_pred = TRUE,
                             save_workflow = TRUE,
                             extract = extract_model)

augment.workflow <- function(x, newdata, ...) {
  predict(x, newdata, ...) %>%
    bind_cols(newdata)
}

predict_on_holdout <- function(workflow, tbl_train, tbl_test) {
  workflow %>%
    fit(tbl_train) %>%
    augment(tbl_test)
}

# for use in fit_resamples()
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
3.1 Train and test on training data set
3.1.1 lin1: pop_density ~ light_avg
Let's start simple. In lin1 most of the data is concentrated at low radiance values, and outliers at high pop_density depress the slope of the fitted line.
lm_model <-
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

my_formula <- "pop_density ~ light_avg"

lin_rec <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula = as.formula(my_formula))

lin1_wf <-
  workflow() %>%
  add_recipe(lin_rec) %>%
  add_model(lm_model)

lin1_res <- lin1_wf %>%
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lin1_metric_summary <- lin1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lin1_metric_summary[[1, 1]]} = {round(lin1_metric_summary[[1, 2]], 3)}",
                                "; {lin1_metric_summary[[2, 1]]} = {round(lin1_metric_summary[[2, 2]], 3)}")

lin1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{lin1_metric_summary$model}"),
       subtitle = glue("Actual and predicted pop_density: {metric_summary_for_plot}")
  )
3.1.2 lin2: pop_density_official ~ light_avg
Should I use official area values from the US Census Bureau to calculate population density instead of area values calculated from the county boundaries? In a word: no. Using official area values, rsq and ccc are slightly lower than lin1 (Figure 3.1), perhaps because pop_density is calculated using the same boundaries I use to determine which light pixels are part of each county, so similar errors appear in both. I use the area values I calculated in all other models.
<- "pop_density_official ~ light_avg"
my_formula
<- recipe(
lin_rec2 data = analysis(light_folds$splits[[1]]),
formula = as.formula(my_formula))
<-
lin2_wf workflow() %>%
add_recipe(lin_rec2) %>%
add_model(lm_model) # defined above
<- lin2_wf %>%
lin2_res fit_resamples(resamples = light_folds,
metrics = mset,
control = keep_pred)
<- lin2_res %>%
lin2_metric_summary collect_metrics() %>%
mutate(model = glue("lin2: {my_formula}")) %>%
::select(".metric", .estimate = "mean", "n", "std_err", "model")
dplyr
<- glue("{lin2_metric_summary[[1, 1]]} = {round(lin2_metric_summary[[1, 2]], 3)}",
metric_summary_for_plot "; {lin2_metric_summary[[2, 1]]} = {round(lin2_metric_summary[[2, 2]], 3)}")
%>%
lin2_res collect_predictions() %>%
ggplot(aes(x = pop_density_official, y = .pred)) +
geom_abline(lty = 2, alpha = 0.5) +
geom_point(alpha = 0.15) +
labs(title = glue("{lin2_metric_summary$model}"),
subtitle = glue("Actual and predicted pop_density_official: {metric_summary_for_plot}")
)
3.1.3 lin3: log_pop_density ~ log_light_avg
With both predictor and response on a log scale, performance metrics are better. Compare with lin1 in Figure 3.1.
<- "log_pop_density ~ log_light_avg"
my_formula
<- recipe(
lin_rec3 data = analysis(light_folds$splits[[1]]),
formula = as.formula(my_formula))
<-
lin3_wf workflow() %>%
add_recipe(lin_rec3) %>%
add_model(lm_model) # defined above
<- lin3_wf %>%
lin3_res fit_resamples(resamples = light_folds,
metrics = mset,
control = keep_pred)
<- lin3_res %>%
lin3_metric_summary collect_metrics() %>%
mutate(model = glue("lin3: {my_formula}")) %>%
::select(".metric", .estimate = "mean", "n", "std_err", "model")
dplyr
<- glue("{lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 2]], 3)}",
metric_summary_for_plot "; {lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 2]], 3)}")
%>%
lin3_res collect_predictions() %>%
ggplot(aes(x = log_pop_density, y = .pred)) +
geom_abline(lty = 2, alpha = 0.5) +
geom_point(alpha = 0.15) +
# coord_obs_pred() +
labs(title = glue("{lin3_metric_summary$model}"),
subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
)
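Since lin3 models \(log_{10}(pop\_density + 10^{-6})\), a prediction can be mapped back to the original density scale by inverting that transform. A minimal sketch using the cross-validation predictions collected above (naive back-transform, no bias correction):
# Sketch: undo the log10(x + 1e-6) transform to view predictions as pop_density.
lin3_res %>%
  collect_predictions() %>%
  mutate(pred_pop_density = 10^.pred - 1e-6,
         actual_pop_density = 10^log_pop_density - 1e-6) %>%
  select(log_pop_density, .pred, actual_pop_density, pred_pop_density) %>%
  head()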
3.1.4 pol1: log_pop_density ~ poly(log_light_avg, 2)
Since the data in Figure 2.4 looks like it has close to a quadratic relationship (a convex curve), let’s try a second order polynomial. Performance is quite good.
<- "log_pop_density ~ log_light_avg"
my_formula_basic <- "log_pop_density ~ poly(log_light_avg, 2)"
my_formula
<- recipe(
pol1_rec data = analysis(light_folds$splits[[1]]),
formula = as.formula(my_formula_basic)) %>%
step_poly(log_light_avg, degree = 2)
<-
pol1_wf workflow() %>%
add_recipe(pol1_rec) %>%
add_model(lm_model) # defined above
<- pol1_wf %>%
pol1_res fit_resamples(resamples = light_folds,
metrics = mset,
control = keep_pred)
<- pol1_res %>%
pol1_metric_summary collect_metrics() %>%
mutate(model = glue("pol1: {my_formula}")) %>%
::select(".metric", .estimate = "mean", "n", "std_err", "model")
dplyr
<- glue("{pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 2]], 3)}",
metric_summary_for_plot "; {pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 2]], 3)}")
%>%
pol1_res collect_predictions() %>%
ggplot(aes(x = log_pop_density, y = .pred)) +
geom_abline(lty = 2, alpha = 0.5) +
geom_point(alpha = 0.15) +
labs(title = glue("{pol1_metric_summary$model}"),
subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
)
3.1.5 lme1: multi-level model with lme4::lmer
Since states differ in the range and distribution of radiance values, is there an improvement if I use a mixed-effects (multi-level) model, log_pop_density ~ log_light_avg + (log_light_avg | state), with a random intercept and slope for each state? Unfortunately not: there is no meaningful improvement over Figure 3.3. Apparently state doesn't hold enough unique information to be helpful.
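As a cross-check outside the tidymodels pipeline, the same formula can be fit directly with lme4. This is a sketch assuming only train_split from above; ranef() shows how much the intercept and slope actually vary by state:
# Sketch: fit the multi-level formula directly and inspect the state-level effects.
library(lme4)

lme_check <- lmer(log_pop_density ~ log_light_avg + (log_light_avg | state),
                  data = train_split)

fixef(lme_check)  # overall (fixed) intercept and slope
ranef(lme_check)  # per-state deviations from the fixed intercept and slope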
lme_model <-
  linear_reg() %>%
  set_engine("lmer") %>%
  set_mode("regression")

my_formula <- "log_pop_density ~ log_light_avg + (log_light_avg | state)"

lme1_wf <-
  workflow() %>%
  add_variables(outcomes = log_pop_density,
                predictors = c(log_light_avg, state)
  ) %>%
  add_model(
    lme_model,
    formula = as.formula(my_formula)
  )

lme1_res <- lme1_wf %>%
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lme1_metric_summary <- lme1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lme1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 2]], 3)}",
                                "; {lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 2]], 3)}")

lme1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
3.1.6 bs1: linear model with b-spline
Since lin3 (Figure 3.3) seems to over-predict at high and low radiance values, let's try a spline. First I tune the model to determine how many degrees of freedom (roughly, how many knots) to include in the spline:
bs_rec <- recipe(log_pop_density ~ log_light_avg,
                 data = analysis(light_folds$splits[[1]])) %>%
  step_ns(log_light_avg, deg_free = tune("log_light_avg"))

bs_model <-
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

my_formula <- "log_pop_density ~ bs(log_light_avg)"

bs1_wf <-
  workflow() %>%
  add_recipe(bs_rec) %>%
  add_model(bs_model)

k_grid <- tibble(log_light_avg = 1:6)

bs_tune <- bs1_wf %>%
  tune_grid(resamples = light_folds,
            grid = k_grid,
            metrics = mset,
            control = grid_control)

autoplot(bs_tune) +
  labs(title = "Tuning bs1")
I'll use deg_free = 2, which maximizes both ccc and rsq.
bs_wf_best <- bs1_wf %>%
  finalize_workflow(select_best(bs_tune, metric = "ccc"))

select_best(bs_tune, metric = "ccc") %>%
  gt()
log_light_avg | .config |
---|---|
2 | Preprocessor2_Model1 |
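To see what the finalized spline actually estimates, one option (a sketch; extract_fit_parsnip() returns the underlying lm fit) is to fit the workflow once on the training data and tidy the coefficients:
# Sketch: inspect the natural-spline basis coefficients of the finalized model.
bs_wf_best %>%
  fit(train_split) %>%
  extract_fit_parsnip() %>%
  tidy()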
Both ccc and rsq are slightly better than lin3, and the high and low predictions are not consistently overestimated:
bs1_res <- bs_wf_best %>%
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

bs1_metric_summary <- bs1_res %>%
  collect_metrics() %>%
  mutate(model = glue("bs1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 2]], 3)}",
                                "; {bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 2]], 3)}")

bs1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
3.1.7 XGBoost: log_pop_density ~ log_light_avg + state
Now for a non-linear model that needs training: boosted trees using XGBoost. In this model the state categories are encoded as dummy variables.
set.seed(2021)

xg1_rec <- recipe(log_pop_density ~ log_light_avg + state,
                  data = analysis(light_folds$splits[[1]])) %>%
  step_dummy(state)

xg1_wf <- workflow() %>%
  add_recipe(xg1_rec) %>%
  add_model(boost_tree(mode = "regression",
                       mtry = tune(),
                       trees = tune(),
                       # tree_depth = , # default is 6
                       learn_rate = .01) %>% set_engine("xgboost"))
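Dummy-encoding state expands the predictor set well beyond the single radiance column, which is why the mtry grid below goes up to 11. A sketch using the prep_juice() helper defined earlier (the exact dummy-column names depend on the states present in the data):
# Sketch: bake the recipe to confirm that step_dummy() turns state into
# one indicator column per state (minus a reference level), alongside log_light_avg.
prep_juice(xg1_rec) %>%
  glimpse()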
xg1_tune <- xg1_wf %>%
  tune_grid(light_folds,
            grid = crossing(mtry = c(2, 5, 8, 11),
                            trees = seq(50, 400, 50),
                            #threshold = c(0.01) # default is 0.01
            ),
            metrics = mset,
            control = grid_control)
Tuning the model: the results suggest using between 250 and 400 trees, with nearly equal performance when sampling 5, 8, or 11 variables at each split:
autoplot(xg1_tune) +
labs(title = "Tuning xg1")
The best fit uses 400 trees and \(mtry = 8\), randomly sampling 8 variables at each split (nearly all of them in this case). The benefit of 400 trees over 300 is very small, and given that we have only ~650 data points for training (before accounting for 10-fold cross-validation), I use the smaller number; fewer trees reduce the chance of overfitting.
xg1_wf_best <- xg1_wf %>%
  finalize_workflow(select_best(xg1_tune, metric = "ccc")) %>%
  update_model(boost_tree(mode = "regression",
                          mtry = 8,
                          trees = 300,
                          # tree_depth = 3,
                          learn_rate = .01) %>% set_engine("xgboost"))
So I use these:
print(xg1_wf_best)
## ══ Workflow ═════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ─────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_dummy()
##
## ── Model ────────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = 8
## trees = 300
## learn_rate = 0.01
##
## Computational engine: xgboost
Summary metrics are good; however, the model underestimates log_pop_density across most of the radiance range:
<- "log_pop_density ~ log_light_avg + state"
my_formula
<- xg1_wf_best %>%
xg1_fit_best fit(train_split)
<- xg1_fit_best %>%
xg1_res last_fit(main_split, metrics = mset)
<- xg1_res %>%
xg1_metric_summary collect_metrics() %>%
mutate(model = glue("xg1: {my_formula}")) %>%
mutate(n = n_folds,
std_err = NA_real_) %>%
::select(.metric, .estimate, n, std_err, model)
dplyr
<- glue("{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 2]], 3)}",
metric_summary_for_plot "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 2]], 3)}")
%>%
xg1_res collect_predictions() %>%
ggplot(aes(x = log_pop_density, y = .pred)) +
geom_abline(lty = 2, alpha = 0.5) +
geom_point(alpha = 0.15) +
theme(plot.title = element_text(size = 14)) +
labs(title = glue("{xg1_metric_summary$model}"),
subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
)
3.1.8 Summary of train-test results
As expected, the highest-performing model is not the simplest. lin1 and lin2 are the two worst-performing models; I exclude them from the next stage. The others (simpler and more complex) perform similarly.
all_metrics <- bind_rows(
  lin1_metric_summary,
  lin2_metric_summary,
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  NULL
) %>%
  filter(.metric %in% c("ccc", "rsq"))

model_levels <- all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_errorbarh(aes(xmax = .estimate + std_err, xmin = .estimate - std_err),
                 height = 0.2) +
  geom_point() +
  expand_limits(x = 1.0) +
  facet_wrap( ~ .metric, nrow = 1) +
  theme(plot.title.position = "plot") +
  labs(title = glue("Comparing model performance for metrics ",
                    "{glue_collapse(sort(unique(all_metrics$.metric)), sep = ', ')}"),
       subtitle = "Performance on train-test set",
       x = "mean metric estimate across all folds; error bars +/- std_err")

all_metrics %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
    title = md("**Model evaluation on train-test data set**")
  ) %>%
  fmt_number(columns = c(.estimate, std_err),
             decimals = 3) %>%
  fmt_missing(columns = everything(),
              missing_text = "---") %>%
  cols_align(columns = model, align = "left")
Model evaluation on train-test data set

.metric | .estimate | n | std_err | model |
---|---|---|---|---|
ccc | 0.966 | 10 | — | xg1: log_pop_density ~ log_light_avg + state |
ccc | 0.953 | 10 | 0.004 | bs1: log_pop_density ~ bs(log_light_avg) |
ccc | 0.953 | 10 | 0.004 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
ccc | 0.948 | 10 | 0.004 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) |
ccc | 0.947 | 10 | 0.005 | lin3: log_pop_density ~ log_light_avg |
ccc | 0.917 | 10 | 0.009 | lin1: pop_density ~ light_avg |
ccc | 0.893 | 10 | 0.024 | lin2: pop_density_official ~ light_avg |
rsq | 0.955 | 10 | — | xg1: log_pop_density ~ log_light_avg + state |
rsq | 0.942 | 10 | 0.008 | bs1: log_pop_density ~ bs(log_light_avg) |
rsq | 0.942 | 10 | 0.008 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
rsq | 0.931 | 10 | 0.008 | lin3: log_pop_density ~ log_light_avg |
rsq | 0.931 | 10 | 0.008 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) |
rsq | 0.931 | 10 | 0.020 | lin1: pop_density ~ light_avg |
rsq | 0.922 | 10 | 0.022 | lin2: pop_density_official ~ light_avg |
3.2 Assessing model performance with the holdout data set
In this section I use data from 2000 that the models haven't seen yet (the holdout data set).
lin3's summary metrics are quite good; however, the curvature in the data shows up again in the way the model over-predicts at high and low radiance values:
lin3_pred <- lin3_wf %>%
  predict_on_holdout(., train_split, holdout)

lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))

lin3_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )
pol1 looks good but slightly over-predicts at high radiance values:
pol1_pred <- pol1_wf %>%
  predict_on_holdout(., train_split, holdout)

pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))

pol1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )
Again lme1 fails to do better than lin3:
# error=TRUE to continue execution in case model fails to converge
lme1_pred <- lme1_wf %>%
  predict_on_holdout(., train_split, holdout)

lme1_metric_summary <- bind_rows(
  rsq(lme1_pred, truth = log_pop_density, estimate = .pred),
  ccc(lme1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lme1_metric_summary$model))

lme1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 3]], 3)}",
                       "; {lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 3]], 3)}")
  )
Again bs1 is performing quite well:
bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., train_split, holdout)

bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))

bs1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
  )
xg1 performance metrics are good, but the model is again systematically underestimating log_pop_density:
xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., train_split, holdout)

xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))

xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )
3.2.1 Summary: model evaluation using holdout data set
Results are similar for all models. The very simple lin3 is very competitive. I exclude lme1 from the next stage.
all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  xg1_metric_summary
)

model_levels <- all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3),
                                 width = 5, side = "right", pad = "0")),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on holdout data set",
       x = "metric estimate",
       y = "")

all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
    title = md("**Model evaluation on holdout data set**")
  ) %>%
  fmt_number(columns = .estimate,
             decimals = 3)
Model evaluation on holdout data set

.metric | .estimate | model |
---|---|---|
ccc | 0.972 | bs1: log_pop_density ~ bs(log_light_avg) |
ccc | 0.966 | xg1: log_pop_density ~ log_light_avg + state |
ccc | 0.963 | lin3: log_pop_density ~ log_light_avg |
ccc | 0.963 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
ccc | 0.963 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) |
rsq | 0.955 | xg1: log_pop_density ~ log_light_avg + state |
rsq | 0.954 | bs1: log_pop_density ~ bs(log_light_avg) |
rsq | 0.937 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) |
rsq | 0.937 | lin3: log_pop_density ~ log_light_avg |
rsq | 0.937 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
3.3 Comparing 2000 and 2010
I use the four remaining models, refit on the 2000 holdout set, to predict log_pop_density in 2010 from that year's average county radiance (log_light_avg), and compare this performance with the results on the holdout set above.
lin3 looks pretty good, but the model overestimates high 2010 log_pop_density values.
lin3_pred <- lin3_wf %>%
  predict_on_holdout(., holdout, assess_df)

lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))

lin3_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )
pol1 is underestimating low 2010 log_pop_density values and overestimating most other values:
pol1_pred <- pol1_wf %>%
  predict_on_holdout(., holdout, assess_df)

pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))

pol1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )
Like pol1, bs1 is underestimating low 2010 log_pop_density values and overestimating most other values:
bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., holdout, assess_df)

bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))

bs1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
  )
xg1 is underestimating low and high log_pop_density values:
xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., holdout, assess_df)

xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))

xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ",
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )
3.3.1 Summary: model performance on census 2010 data.
The simplest model, lin3, performs slightly better, and the next-simplest models, pol1 and bs1, are essentially equal. xg1 does worse at high and low log_pop_density values, a sign that xg1 may be overfitted to the training data. Even so, the differences in performance among the models are quite small.
Looking at the absolute value of the difference in means between predicted and true log_pop_density, I see that all four models perform similarly:
all_means <- bind_rows(
  with(lin3_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>%
    mutate(model = unique(lin3_metric_summary$model)),
  with(xg1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>%
    mutate(model = unique(xg1_metric_summary$model)),
  with(pol1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>%
    mutate(model = unique(pol1_metric_summary$model)),
  with(bs1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>%
    mutate(model = unique(bs1_metric_summary$model))
) %>%
  rename(diff_in_means = estimate) %>%
  arrange(desc(abs(diff_in_means))) %>%
  mutate(model = as_factor(model)) # creates levels with current ordering

all_means %>%
  ggplot(aes(abs(diff_in_means), model)) +
  geom_errorbarh(aes(xmin = abs(conf.low), xmax = abs(conf.high), height = 0.1)) +
  geom_point() +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Difference in means between predicted and\nactual log_pop_density (2010 data)",
       subtitle = "T-test at 95% CI.",
       x = "difference in means",
       y = "")
all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  bs1_metric_summary
)

model_levels <- all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3),
                                 width = 5, side = "right", pad = "0")),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on 2010 data",
       x = "metric estimate",
       y = "")

all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
    title = md("**Model evaluation on 2010 data**")
  ) %>%
  fmt_number(columns = c(.estimate),
             decimals = 3)
Model evaluation on 2010 data

.metric | .estimate | model |
---|---|---|
ccc | 0.967 | lin3: log_pop_density ~ log_light_avg |
ccc | 0.957 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
ccc | 0.957 | bs1: log_pop_density ~ bs(log_light_avg) |
ccc | 0.953 | xg1: log_pop_density ~ log_light_avg + state |
rsq | 0.945 | lin3: log_pop_density ~ log_light_avg |
rsq | 0.927 | bs1: log_pop_density ~ bs(log_light_avg) |
rsq | 0.926 | pol1: log_pop_density ~ poly(log_light_avg, 2) |
rsq | 0.915 | xg1: log_pop_density ~ log_light_avg + state |