3 Modeling and model comparison
As noted in the EDA section:
The relationship between \(\log_{10}(pop\_density)\) and \(\log_{10}(light\_avg)\) (figures 2.4 and 2.5) isn’t quite linear, and there are differences by state. So I expect simple linear models to perform worse than more sophisticated ones. Nonetheless, I want to see how good linear models are, since they are simple and inexpensive.
In this section I do the following:
- Prepare train-test and holdout data sets
- Build (and where necessary, train) some models, then compare them using performance metrics
- Test the best-performing models against the holdout data set.
I use the following performance metrics:
- Concordance correlation coefficient (ccc), which according to the help for yardstick::ccc() “is a metric of both consistency/correlation and accuracy.”
- R-squared. I use yardstick::rsq(), which according to the documentation “is simply the squared correlation between truth and estimate … [that] guarantees a value on \((0,1)\).”

I use metrics that are scale-free, since I want the freedom to use models with different transformations of the data. The principal metric is ccc.
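For reference, the concordance correlation coefficient between truth \(y\) and estimate \(\hat{y}\) is
\[
ccc = \frac{2\,\rho\,\sigma_{y}\sigma_{\hat{y}}}{\sigma_{y}^2 + \sigma_{\hat{y}}^2 + (\mu_{y} - \mu_{\hat{y}})^2}
\]
where \(\rho\) is the Pearson correlation between \(y\) and \(\hat{y}\). It equals 1 only when the predictions lie exactly on the 45-degree line, so it penalizes both scatter (as rsq does) and systematic bias (which rsq ignores).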
# For 2000
data_df <- tar_read(all_df) %>%
  #fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)
# prepare data for train-test and holdout
d_df <- data_df %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)
# following https://cran.r-project.org/web/packages/rsample/vignettes/Working_with_rsets.html
set.seed(123)
main_split = initial_split(d_df, strata = c("state"))
train_split <- analysis(main_split)
holdout <- assessment(main_split)
light_folds <- vfold_cv(train_split, v = n_folds, repeats = 1)
# For 2010
data_df_c <- tar_read(all_df_c) %>%
  #fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)
# prepare data for train-test and holdout
assess_df <- data_df_c %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)
set.seed(2021)
# inspiration/source: Dave Robinson's coding videos; see https://github.com/dgrtwo/data-screencasts
prep_juice <- function(x) juice(prep(x))
mset <- metric_set(rsq, ccc) # use scale-free metrics (not rmse)
grid_control <- control_grid(save_pred = TRUE,
                             save_workflow = TRUE,
                             extract = extract_model)
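# augment() method for workflow objects: predict on newdata and bind the predictions to it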
augment.workflow <- function(x, newdata, ...) {
  predict(x, newdata, ...) %>%
    bind_cols(newdata)
}
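# fit a workflow on one data set, predict on a second, and return the second with a .pred column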
predict_on_holdout <- function(workflow, tbl_train, tbl_test) {
  workflow %>%
    fit(tbl_train) %>%
    augment(tbl_test)
}
# for use in fit_resamples()
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
3.1 Train and test on training data set
3.1.1 lin1: pop_density ~ light_avg
Let’s start simple. In lin1 the data is concentrated at smaller radiance values, and outliers at high pop_density depress the slope of the fitted line.
lm_model <- 
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")
my_formula <- "pop_density ~ light_avg"
lin_rec <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin1_wf <-
  workflow() %>%
  add_recipe(lin_rec) %>%
  add_model(lm_model)
lin1_res <- lin1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
lin1_metric_summary <- lin1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{lin1_metric_summary[[1, 1]]} = {round(lin1_metric_summary[[1, 2]], 3)}",
                       "; {lin1_metric_summary[[2, 1]]} = {round(lin1_metric_summary[[2, 2]], 3)}")
lin1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{lin1_metric_summary$model}"),
       subtitle = glue("Actual and predicted pop_density: {metric_summary_for_plot}")
  )
Figure 3.1: lin1: Simplest linear model
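As a side check (not part of the resampling workflow), fitting the workflow on a single analysis set and tidying the coefficients shows the depressed slope directly. A sketch, assuming extract_fit_parsnip() from the workflows package (pull_workflow_fit() in older versions):
# side check: fitted intercept and slope of lin1 on one analysis set
lin1_wf %>%
  fit(analysis(light_folds$splits[[1]])) %>%
  extract_fit_parsnip() %>%
  tidy()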
3.1.2 lin2: pop_density_official ~ light_avg
Should I use official area values from the US Census Bureau to calculate population density instead of area values calculated from the county boundaries? In a word: no. Using official area values, rsq and ccc are slightly lower than in lin1 (figure 3.1), perhaps because pop_density is calculated using the same boundaries I’m using to determine which light pixels are part of each county, and therefore I’m dealing with similar error in both. I use the area values I calculated in all other models.
my_formula <- "pop_density_official ~ light_avg"
lin_rec2 <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin2_wf <-
  workflow() %>%
  add_recipe(lin_rec2) %>%
  add_model(lm_model) # defined above
lin2_res <- lin2_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
lin2_metric_summary <- lin2_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin2: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{lin2_metric_summary[[1, 1]]} = {round(lin2_metric_summary[[1, 2]], 3)}",
                       "; {lin2_metric_summary[[2, 1]]} = {round(lin2_metric_summary[[2, 2]], 3)}")
lin2_res %>%
  collect_predictions() %>%
  ggplot(aes(x = pop_density_official, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{lin2_metric_summary$model}"),
       subtitle = glue("Actual and predicted pop_density_official: {metric_summary_for_plot}")
  )
Figure 3.2: lin2: Using official area values to calculate population density
3.1.3 lin3: log_pop_density ~ log_light_avg
With both predictor and response on a log scale, performance metrics are better. Compare with lin1 in Figure 3.1.
my_formula <- "log_pop_density ~ log_light_avg"
lin_rec3 <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin3_wf <-
  workflow() %>%
  add_recipe(lin_rec3) %>%
  add_model(lm_model) # defined above
lin3_res <- lin3_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
lin3_metric_summary <- lin3_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin3: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 2]], 3)}",
                       "; {lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 2]], 3)}")
lin3_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  # coord_obs_pred() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
Figure 3.3: lin3: Simplest linear model with log10 transformations
3.1.4 pol1: log_pop_density ~ poly(log_light_avg, 2)
Since the relationship in Figure 2.4 looks close to quadratic (a convex curve), let’s try a second-order polynomial. Performance is quite good.
my_formula_basic <- "log_pop_density ~ log_light_avg"
my_formula <- "log_pop_density ~ poly(log_light_avg, 2)"
pol1_rec <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula_basic)) %>%
  step_poly(log_light_avg, degree = 2)
  
pol1_wf <-
  workflow() %>%
  add_recipe(pol1_rec) %>%
  add_model(lm_model) # defined above
pol1_res <- pol1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
pol1_metric_summary <- pol1_res %>%
  collect_metrics() %>%
  mutate(model = glue("pol1: {my_formula}"))  %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 2]], 3)}",
                       "; {pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 2]], 3)}")
pol1_res %>% 
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
Figure 3.4: pol1: Linear model with second order polynomial
3.1.5 lme1: multi-level model with lme4::lmer
Since states differ in the range and distribution of radiance values, are there improvements if I use a mixed-effects model, “log_pop_density ~ log_light_avg + (log_light_avg | state)”? Unfortunately not. There is no meaningful improvement over Figure 3.3. Apparently state doesn’t hold enough unique information to be helpful.
lme_model <- 
  linear_reg() %>%
  set_engine("lmer") %>%
  set_mode("regression")
my_formula <- "log_pop_density ~ log_light_avg + (log_light_avg | state)"
lme1_wf <-
  workflow() %>%
  add_variables(outcomes = log_pop_density,
                predictors = c(log_light_avg, state)
                ) %>%
  add_model(
    lme_model,
    formula = as.formula(my_formula)
    )
lme1_res <- lme1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
lme1_metric_summary <- lme1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lme1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 2]], 3)}",
                                "; {lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 2]], 3)}")
lme1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
Figure 3.5: lme1: Multi-level model using ‘state’
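As a quick check on why state adds so little, one can fit the mixed model directly on the training split with lme4 and look at how much the per-state slopes actually vary. A sketch, assuming the lme4 package is installed:
# fit the mixed model outside tidymodels and inspect the per-state random effects
lme1_direct <- lme4::lmer(log_pop_density ~ log_light_avg + (log_light_avg | state),
                          data = train_split)
lme4::ranef(lme1_direct)$state # per-state deviations in intercept and slope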
3.1.6 bs1: linear model with b-spline
Since lin3 (figure 3.3) seems to be overestimating at high and low radiance values, let’s try a spline. First I tune the model to determine how many degrees of freedom to give the spline:
bs_rec <- recipe(log_pop_density ~ log_light_avg,
                  data = analysis(light_folds$splits[[1]])) %>%
  step_ns(log_light_avg, deg_free = tune("log_light_avg"))
bs_model <- 
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")
my_formula = "log_pop_density ~ bs(log_light_avg)"
bs1_wf <-
  workflow() %>%
  add_recipe(bs_rec) %>%
  add_model(bs_model) 
k_grid <- tibble(log_light_avg = 1:6)
bs_tune <- bs1_wf %>%
  tune_grid(resamples = light_folds,
            grid = k_grid,
            metrics = mset,
            control = grid_control)
autoplot(bs_tune) +
  labs(title = "Tuning bs1")
Figure 3.6: Tuning bs1
I’ll use 2 degrees of freedom (deg_free = 2), which maximizes both ccc and rsq.
bs_wf_best <- bs1_wf %>%
  finalize_workflow(select_best(bs_tune, metric = "ccc"))
select_best(bs_tune, metric = "ccc") %>%
  gt()
| log_light_avg | .config |
|---|---|
| 2 | Preprocessor2_Model1 | 
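For the full tuning results rather than just the winning row, collect_metrics() on the tuning object gives the fold-averaged estimate for every candidate deg_free (the parameter column is named after the tune() id):
# fold-averaged ccc for each candidate deg_free
bs_tune %>%
  collect_metrics() %>%
  filter(.metric == "ccc") %>%
  arrange(desc(mean))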
Both ccc and rsq are slightly better than lin3, and the high and low predictions are not consistently overestimated:
bs1_res <- bs_wf_best %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)
bs1_metric_summary <- bs1_res %>% collect_metrics() %>%
  mutate(model = glue("bs1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")
metric_summary_for_plot <- glue("{bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 2]], 3)}",
                                "; {bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 2]], 3)}")
bs1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
Figure 3.7: bs1: Linear model with B-spline to capture curvature
3.1.7 XGBoost: log_pop_density ~ log_light_avg + state
Now for a non-linear model, and one that needs training: boosted trees using XGBoost. In this model the state categories are converted to dummy variables.
set.seed(2021)
xg1_rec <- recipe(log_pop_density ~ log_light_avg + state,
                  data = analysis(light_folds$splits[[1]])) %>%
  step_dummy(state)
xg1_wf <- workflow() %>%
  add_recipe(xg1_rec) %>%
  add_model(boost_tree(mode = "regression",
                       mtry = tune(),
                       trees = tune(),
                       # tree_depth = , # default is 6
                       learn_rate = .01) %>% set_engine("xgboost"))
xg1_tune <- xg1_wf %>%
  tune_grid(light_folds,
            grid = crossing(mtry = c(2, 5, 8, 11),
                       trees = seq(50, 400, 50),
                       #threshold = c(0.01) # default is 0.01
                       ),
            metrics = mset,
            control = grid_control)
Tuning the model: the results suggest using between 250 and 400 trees, with nearly equal performance whether 5, 8, or 11 variables are sampled at each split:
autoplot(xg1_tune) +
  labs(title = "Tuning xg1")
Figure 3.8: Tuning xg1
The best fit uses 400 trees and \(mtry = 8\), randomly sampling 8 variables at each split (nearly all of them in this case). The benefit of 400 trees versus 300 is very small, and given that we have roughly 650 data points for training (before accounting for 10-fold cross-validation), I use the smaller number: fewer trees reduce the chance of overfitting.
xg1_wf_best <- xg1_wf %>%
  finalize_workflow(select_best(xg1_tune, metric = "ccc")) %>%
  update_model(boost_tree(mode = "regression",
                       mtry = 8,
                       trees = 300,
                       # tree_depth = 3,
                       learn_rate = .01) %>% set_engine("xgboost"))
So I use these:
print(xg1_wf_best)
## ══ Workflow ═════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ─────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_dummy()
## 
## ── Model ────────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 8
##   trees = 300
##   learn_rate = 0.01
## 
## Computational engine: xgboost
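Before looking at the fit, a quick way to confirm what the recipe feeds the model is to run it through the prep_juice() helper defined earlier and glimpse the dummy-encoded columns (a sketch; glimpse() comes from dplyr):
# sanity check: the processed training data, with state expanded into dummy columns
xg1_rec %>%
  prep_juice() %>%
  glimpse()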
Summary metrics are good; however, the model underestimates log_pop_density over most of the radiance range:
my_formula <- "log_pop_density ~ log_light_avg + state"
xg1_fit_best <- xg1_wf_best %>%
  fit(train_split)
xg1_res <- xg1_fit_best %>%
  last_fit(main_split, metrics = mset)
xg1_metric_summary <- xg1_res %>%
  collect_metrics() %>%
  mutate(model = glue("xg1: {my_formula}")) %>%
  mutate(n = n_folds,
         std_err = NA_real_) %>%
  dplyr::select(.metric, .estimate, n, std_err, model)
metric_summary_for_plot <- glue("{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 2]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 2]], 3)}")
xg1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )
Figure 3.9: xg1: Gradient-boosted trees (XGBoost)
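To see which predictors the boosted trees actually rely on, one option is to pull the underlying xgboost booster out of the fitted workflow and plot variable importance. A sketch, assuming the vip package and a version of workflows that provides extract_fit_engine():
# variable importance from the fitted booster: how much the state dummies add to log_light_avg
library(vip)
xg1_fit_best %>%
  extract_fit_engine() %>%
  vip(num_features = 10)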
3.1.8 Summary of train-test results
As expected, the highest-performing model is not the simplest. lin1 and lin2 are the two worst-performing models; I exclude them from the next stage. The others (simpler and more complex) perform similarly.
all_metrics <- bind_rows(
  lin1_metric_summary,
  lin2_metric_summary,
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  NULL
) %>%
  filter(.metric %in% c("ccc", "rsq"))
model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)
all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_errorbarh(aes(xmax = .estimate + std_err, xmin = .estimate - std_err),
                height = 0.2) +
  geom_point() +
  expand_limits(x = 1.0) +
  facet_wrap( ~ .metric, nrow = 1) +
  theme(plot.title.position = "plot") +
  labs(title = glue("Comparing model performance for metrics ",
                    "{glue_collapse(sort(unique(all_metrics$.metric)), sep = ', ')}"),
       subtitle = "Performance on train-test set",
       x = "mean metric estimate across all folds; error bars +/- std_err")
Figure 3.10: Summary of train-test results
all_metrics %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
    title = md("**Model evaluation on train-test data set**")
    ) %>%
  fmt_number(columns = c(.estimate, std_err),
             decimals = 3) %>%
  fmt_missing(columns = everything(),
              missing_text = "---") %>%
  cols_align(columns = model, align = "left")
| Model evaluation on train-test data set | | | | |
|---|---|---|---|---|
| .metric | .estimate | n | std_err | model | 
| ccc | 0.966 | 10 | — | xg1: log_pop_density ~ log_light_avg + state | 
| ccc | 0.953 | 10 | 0.004 | bs1: log_pop_density ~ bs(log_light_avg) | 
| ccc | 0.953 | 10 | 0.004 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
| ccc | 0.948 | 10 | 0.004 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) | 
| ccc | 0.947 | 10 | 0.005 | lin3: log_pop_density ~ log_light_avg | 
| ccc | 0.917 | 10 | 0.009 | lin1: pop_density ~ light_avg | 
| ccc | 0.893 | 10 | 0.024 | lin2: pop_density_official ~ light_avg | 
| rsq | 0.955 | 10 | — | xg1: log_pop_density ~ log_light_avg + state | 
| rsq | 0.942 | 10 | 0.008 | bs1: log_pop_density ~ bs(log_light_avg) | 
| rsq | 0.942 | 10 | 0.008 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
| rsq | 0.931 | 10 | 0.008 | lin3: log_pop_density ~ log_light_avg | 
| rsq | 0.931 | 10 | 0.008 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) | 
| rsq | 0.931 | 10 | 0.020 | lin1: pop_density ~ light_avg | 
| rsq | 0.922 | 10 | 0.022 | lin2: pop_density_official ~ light_avg | 
3.2 Assessing model performance with the holdout data set
In this section I use data from 2000 that the models haven’t seen yet (the holdout data set).
lin3’s summary metrics are quite good; however, the curvature in the data is again visible in the way the model overestimates at high and low radiance values:
lin3_pred <- lin3_wf %>%
  predict_on_holdout(., train_split, holdout)
lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))
lin3_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )
Figure 3.11: lin3 using holdout set
pol1 looks good but is slightly overestimating high radiance values:
pol1_pred <- pol1_wf %>%
  predict_on_holdout(., train_split, holdout)
pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))
pol1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.12: pol1 using holdout set
Again lme1 fails to do better than lin3:
# error=TRUE to continue execution in case model fails to converge
lme1_pred <- lme1_wf %>%
  predict_on_holdout(., train_split, holdout)
lme1_metric_summary <- bind_rows(
  rsq(lme1_pred, truth = log_pop_density, estimate = .pred),
  ccc(lme1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lme1_metric_summary$model))
lme1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 3]], 3)}",
                       "; {lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.13: lme1 using holdout set
Again bs1 is performing quite well:
bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., train_split, holdout)
bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))
bs1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.14: bs1 using holdout set
xg1 performance metrics are good, but the model is again systematically underestimating:
xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., train_split, holdout)
xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))
xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.15: xg1 using holdout set
3.2.1 Summary: model evaluation using holdout data set
Results are similar for all models. The very simple lin3 is very competitive. I exclude lme1 from the next stage.
all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  xg1_metric_summary
)
model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)
all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3), 
                                 width = 5, side = "right", pad = "0")
                 ),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on holdout data set",
       x = "metric estimate",
       y = "")
Figure 3.16: Summary of holdout results
all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
  title = md("**Model evaluation on holdout data set**")
  ) %>%
  fmt_number(columns = .estimate,
             decimals = 3)
| Model evaluation on holdout data set | | |
|---|---|---|
| .metric | .estimate | model | 
| ccc | 0.972 | bs1: log_pop_density ~ bs(log_light_avg) | 
| ccc | 0.966 | xg1: log_pop_density ~ log_light_avg + state | 
| ccc | 0.963 | lin3: log_pop_density ~ log_light_avg | 
| ccc | 0.963 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
| ccc | 0.963 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) | 
| rsq | 0.955 | xg1: log_pop_density ~ log_light_avg + state | 
| rsq | 0.954 | bs1: log_pop_density ~ bs(log_light_avg) | 
| rsq | 0.937 | lme1: log_pop_density ~ log_light_avg + (log_light_avg | state) | 
| rsq | 0.937 | lin3: log_pop_density ~ log_light_avg | 
| rsq | 0.937 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
3.3 Comparing 2000 and 2010
I use the four remaining models to predict log_pop_density in 2010 from average county radiance (log_light_avg) in the same year, comparing performance on the 2010 assessment set with performance on the 2000 holdout set.
lin3 looks pretty good, but the model overestimates high 2010 log_pop_density values.
lin3_pred <- lin3_wf %>%
  predict_on_holdout(., holdout, assess_df)
lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))
lin3_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )
Figure 3.17: lin3 2010 assess set results
pol1 is underestimating low 2010 log_pop_density values and overestimating most other values:
pol1_pred <- pol1_wf %>%
  predict_on_holdout(., holdout, assess_df)
pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))
pol1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.18: pol1 2010 assess set results
Like pol1, bs1 is underestimating low 2010 log_pop_density values and overestimating most other values:
bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., holdout, assess_df)
bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))
bs1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
       
  )
Figure 3.19: bs1 2010 assess set results
xg1 is underestimating low and high log_pop_density values:
xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., holdout, assess_df)
xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))
xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )
Figure 3.20: xg1 2010 assess set results
3.3.1 Summary: model performance on census 2010 data
The simplest model, lin3, performs slightly better, and the next-simplest models, pol1 and bs1, are essentially equal. xg1 does worse at high and low log_pop_density values, a sign that it may be overfit to the training data. Even so, the differences in performance among the models are quite small.
Looking at the absolute value of the difference in means between predicted and true log_pop_density, I see all four models performing similarly:
all_means <- bind_rows(
  with(lin3_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(lin3_metric_summary$model)),
  with(xg1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(xg1_metric_summary$model)),
  with(pol1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(pol1_metric_summary$model)),
  with(bs1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(bs1_metric_summary$model))
) %>%
  rename(diff_in_means = estimate) %>%
  arrange(desc(abs(diff_in_means))) %>%
  mutate(model = as_factor(model)) # creates levels with current ordering
all_means %>%
  ggplot(aes(abs(diff_in_means), model)) +
  geom_errorbarh(aes(xmin = abs(conf.low), xmax = abs(conf.high)), height = 0.1) +
  geom_point() +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Difference in means between predicted and\nactual log_pop_density (2010 data)",
       subtitle = "T-test at 95% CI.",
       x = "difference in means",
       y = "")
Figure 3.21: Differences in the means (predicted v actual)
all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  bs1_metric_summary
)
model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)
all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3), 
                                 width = 5, side = "right", pad = "0")
                 ),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on 2010 data",
       x = "metric estimate",
       y = "")
Figure 3.22: Summary of 2010 results
all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
  title = md("**Model evaluation on 2010 data**")
  ) %>%
  fmt_number(columns = c(.estimate),
             decimals = 3)
| Model evaluation on 2010 data | | |
|---|---|---|
| .metric | .estimate | model | 
| ccc | 0.967 | lin3: log_pop_density ~ log_light_avg | 
| ccc | 0.957 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
| ccc | 0.957 | bs1: log_pop_density ~ bs(log_light_avg) | 
| ccc | 0.953 | xg1: log_pop_density ~ log_light_avg + state | 
| rsq | 0.945 | lin3: log_pop_density ~ log_light_avg | 
| rsq | 0.927 | bs1: log_pop_density ~ bs(log_light_avg) | 
| rsq | 0.926 | pol1: log_pop_density ~ poly(log_light_avg, 2) | 
| rsq | 0.915 | xg1: log_pop_density ~ log_light_avg + state | 