3 Modeling and model comparison

As noted in the EDA section:

The relationship between \(log_{10}(pop\_density)\) and \(log_{10}(light\_avg)\) (figures 2.4 and 2.5) isn’t quite linear, and there are differences by state. So I expect simple linear models to perform worse than more sophisticated ones. Nonetheless, I want to see how good linear models are, since they are simple and inexpensive.

In this section I do the following:

  • Prepare the train-test and holdout data sets
  • Build (and where necessary, tune) some models, then compare them using performance metrics
  • Test the best-performing models against the holdout data set

I use the following performance metrics:

  • Concordance correlation coefficient (ccc), which according to the help for yardstick::ccc() “is a metric of both consistency/correlation and accuracy,”
  • R-squared. I use yardstick::rsq(), which according to the documentation “is simply the squared correlation between truth and estimate … [that] guarantees a value on \((0,1)\).”

I use metrics that are scale-free, since I want the freedom to use models with different transformations of the data. The principal metric is ccc.
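
For intuition on the “scale-free” point: Lin’s concordance correlation coefficient is \(\rho_c = \frac{2\rho\,\sigma_{truth}\,\sigma_{estimate}}{\sigma_{truth}^2 + \sigma_{estimate}^2 + (\mu_{truth} - \mu_{estimate})^2}\), and like \(R^2\) it is unitless, so rescaling truth and estimate together leaves it unchanged, whereas rmse scales with the units of the response. Below is a minimal sketch with made-up numbers (not part of the analysis pipeline) illustrating why I avoid rmse:

# toy illustration only: ccc and rsq are unchanged when truth and estimate
# are rescaled together, while rmse scales with the units of the response
library(dplyr)
library(yardstick)

toy <- tibble(truth    = c(1, 2, 4, 8, 16),
              estimate = c(1.2, 1.8, 4.5, 7.0, 17.0))

demo_metrics <- metric_set(rmse, rsq, ccc)

demo_metrics(toy, truth = truth, estimate = estimate)
demo_metrics(mutate(toy, across(everything(), ~ .x * 1000)),
             truth = truth, estimate = estimate)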

# For 2000
data_df <- tar_read(all_df) %>%
  #fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)
# prepare data for train-test and holdout
d_df <- data_df %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)

# following https://cran.r-project.org/web/packages/rsample/vignettes/Working_with_rsets.html
set.seed(123)
main_split = initial_split(d_df, strata = c("state"))
train_split <- analysis(main_split)
holdout <- assessment(main_split)

light_folds <- vfold_cv(train_split, v = n_folds, repeats = 1) # n_folds (10 here) is defined earlier in the report
# For 2010
data_df_c <- tar_read(all_df_c) %>%
  #fix NAs
  mutate(atotal = if_else(!is.na(atotal), atotal, calc_area),
         pop_density_official = if_else(!is.na(pop_density_official), pop_density_official, pop_density)) %>%
  # drop a small number of data errors/artifacts
  filter(light_avg > 1e-5)
# prepare data for train-test and holdout
assess_df <- data_df_c %>%
  # dplyr::select(state:pop_density_official) %>%
  # avoid zero values
  mutate(log_light_avg = log10(light_avg + 1e-6),
         log_pop_density = log10(pop_density + 1e-6),
         log_pop_density_official = log10(pop_density_official + 1e-6)) %>%
  select(state, county, light_avg, log_light_avg, pop_density, log_pop_density, pop_density_official, log_pop_density_official)

set.seed(2021)
# inspiration/source: Dave Robinson's coding videos; see https://github.com/dgrtwo/data-screencasts
# prep() a recipe and return the processed (juiced) training data
prep_juice <- function(x) juice(prep(x))

mset <- metric_set(rsq, ccc) # use scale-free metrics (not rmse)

grid_control <- control_grid(save_pred = TRUE,
                             save_workflow = TRUE,
                             extract = extract_model)

# augment() method for workflows: bind predictions onto the new data
augment.workflow <- function(x, newdata, ...) {
  predict(x, newdata, ...) %>%
    bind_cols(newdata)
}

# fit a workflow on one data set and return its predictions on another
predict_on_holdout <- function(workflow, tbl_train, tbl_test) {
  workflow %>%
    fit(tbl_train) %>%
    augment(tbl_test)
}

# for use in fit_resamples()
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)


3.1 Train and test on training data set

3.1.1 lin1: pop_density ~ light_avg

Let’s start simple. In lin1 the data are concentrated at smaller radiance values, and outliers at high pop_density depress the slope of the fitted line.

lm_model <- 
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

my_formula <- "pop_density ~ light_avg"

lin_rec <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin1_wf <-
  workflow() %>%
  add_recipe(lin_rec) %>%
  add_model(lm_model)

lin1_res <- lin1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lin1_metric_summary <- lin1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lin1_metric_summary[[1, 1]]} = {round(lin1_metric_summary[[1, 2]], 3)}",
                       "; {lin1_metric_summary[[2, 1]]} = {round(lin1_metric_summary[[2, 2]], 3)}")

lin1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{lin1_metric_summary$model}"),
       subtitle = glue("Actual and predicted pop_density: {metric_summary_for_plot}")
  )

Figure 3.1: lin1: Simplest linear model


3.1.2 lin2: pop_density_official ~ light_avg

Should I use official area values from the US Census Bureau to calculate population density, instead of area values calculated from the county boundaries? In a word: no. Using official area values, rsq and ccc are slightly lower than in lin1 (figure 3.1), perhaps because pop_density is calculated using the same boundaries I use to determine which light pixels belong to each county, so both sides of the model carry similar error. I use the area values I calculated in all other models.

my_formula <- "pop_density_official ~ light_avg"

lin_rec2 <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin2_wf <-
  workflow() %>%
  add_recipe(lin_rec2) %>%
  add_model(lm_model) # defined above

lin2_res <- lin2_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lin2_metric_summary <- lin2_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin2: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lin2_metric_summary[[1, 1]]} = {round(lin2_metric_summary[[1, 2]], 3)}",
                       "; {lin2_metric_summary[[2, 1]]} = {round(lin2_metric_summary[[2, 2]], 3)}")

lin2_res %>%
  collect_predictions() %>%
  ggplot(aes(x = pop_density_official, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{lin2_metric_summary$model}"),
       subtitle = glue("Actual and predicted pop_density_official: {metric_summary_for_plot}")
  )

Figure 3.2: lin2: Using official area values to calculate population density

3.1.3 lin3: log_pop_density ~ log_light_avg

With both predictor and response on a log scale, performance metrics are better. Compare with lin1 in Figure 3.1.

my_formula <- "log_pop_density ~ log_light_avg"

lin_rec3 <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula))
  
lin3_wf <-
  workflow() %>%
  add_recipe(lin_rec3) %>%
  add_model(lm_model) # defined above

lin3_res <- lin3_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lin3_metric_summary <- lin3_res %>%
  collect_metrics() %>%
  mutate(model = glue("lin3: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 2]], 3)}",
                       "; {lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 2]], 3)}")

lin3_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  # coord_obs_pred() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )

Figure 3.3: lin3: Simplest linear model with log10 transformations


3.1.4 pol1: log_pop_density ~ poly(log_light_avg, 2)

Since the relationship in Figure 2.4 looks close to quadratic (a convex curve), let’s try a second-order polynomial. Performance is quite good.

my_formula_basic <- "log_pop_density ~ log_light_avg"
my_formula <- "log_pop_density ~ poly(log_light_avg, 2)"

pol1_rec <- recipe(
  data = analysis(light_folds$splits[[1]]),
  formula =  as.formula(my_formula_basic)) %>%
  step_poly(log_light_avg, degree = 2)
  
pol1_wf <-
  workflow() %>%
  add_recipe(pol1_rec) %>%
  add_model(lm_model) # defined above

pol1_res <- pol1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

pol1_metric_summary <- pol1_res %>%
  collect_metrics() %>%
  mutate(model = glue("pol1: {my_formula}"))  %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 2]], 3)}",
                       "; {pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 2]], 3)}")

pol1_res %>% 
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )

Figure 3.4: pol1: Linear model with second order polynomial


3.1.5 lme1: multi-level model with lme4::lmer

Since states differ in the range and distribution of radiance values, are there improvements if I use the mixed-effects (multi-level) model “log_pop_density ~ log_light_avg + (log_light_avg | state)”? Unfortunately not. There is no meaningful improvement over Figure 3.3. Apparently state doesn’t hold enough unique information to be helpful.

lme_model <- 
  linear_reg() %>%
  set_engine("lmer") %>%
  set_mode("regression")

my_formula <- "log_pop_density ~ log_light_avg + (log_light_avg | state)"

lme1_wf <-
  workflow() %>%
  add_variables(outcomes = log_pop_density,
                predictors = c(log_light_avg, state)
                ) %>%
  add_model(
    lme_model,
    formula = as.formula(my_formula)
    )

lme1_res <- lme1_wf %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

lme1_metric_summary <- lme1_res %>%
  collect_metrics() %>%
  mutate(model = glue("lme1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 2]], 3)}",
                                "; {lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 2]], 3)}")

lme1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )

Figure 3.5: lme1: Multi-level model using ‘state’
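
One way to see why state adds so little is to fit the mixed model once on the full training split and inspect the estimated random-effect variances; near-zero variance for the state-level slope would be consistent with the result above. A quick sketch (assuming workflows::extract_fit_engine() is available in the installed version):

# sketch: how much do the state-level intercepts and slopes actually vary?
lme1_fit <- lme1_wf %>%
  fit(train_split)

lme1_fit %>%
  extract_fit_engine() %>% # the underlying lmerMod object
  lme4::VarCorr()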


3.1.6 bs1: linear model with b-spline

Since lin3 (figure 3.3) seems to over-predict log_pop_density at the high and low ends of the radiance range, let’s try a spline (here a natural spline via step_ns()). First I tune the model to choose the spline’s degrees of freedom:

bs_rec <- recipe(log_pop_density ~ log_light_avg,
                  data = analysis(light_folds$splits[[1]])) %>%
  step_ns(log_light_avg, deg_free = tune("log_light_avg"))

bs_model <- 
  linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

my_formula = "log_pop_density ~ bs(log_light_avg)"

bs1_wf <-
  workflow() %>%
  add_recipe(bs_rec) %>%
  add_model(bs_model) 

k_grid <- tibble(log_light_avg = 1:6)

bs_tune <- bs1_wf %>%
  tune_grid(resamples = light_folds,
            grid = k_grid,
            metrics = mset,
            control = grid_control)
autoplot(bs_tune) +
  labs(title = "Tuning bs1")

Figure 3.6: Tuning bs1


I’ll use two degrees of freedom (deg_free = 2), which maximizes both ccc and rsq.

bs_wf_best <- bs1_wf %>%
  finalize_workflow(select_best(bs_tune, metric = "ccc"))

select_best(bs_tune, metric = "ccc") %>%
  gt()
log_light_avg .config
2 Preprocessor2_Model1


Both ccc and rsq are slightly better than for lin3, and predictions at the high and low ends are no longer consistently overestimated:

bs1_res <- bs_wf_best %>% 
  fit_resamples(resamples = light_folds,
                metrics = mset,
                control = keep_pred)

bs1_metric_summary <- bs1_res %>% collect_metrics() %>%
  mutate(model = glue("bs1: {my_formula}")) %>%
  dplyr::select(".metric", .estimate = "mean", "n", "std_err", "model")

metric_summary_for_plot <- glue("{bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 2]], 3)}",
                                "; {bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 2]], 3)}")

bs1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )

Figure 3.7: bs1: Linear model with B-spline to capture curvature


3.1.7 xg1 (XGBoost): log_pop_density ~ log_light_avg + state

Now a non-linear model, and one that needs tuning: gradient boosted trees via XGBoost. In this model the state categories are encoded as dummy variables.

set.seed(2021)

xg1_rec <- recipe(log_pop_density ~ log_light_avg + state,
                  data = analysis(light_folds$splits[[1]])) %>%
  step_dummy(state)

xg1_wf <- workflow() %>%
  add_recipe(xg1_rec) %>%
  add_model(boost_tree(mode = "regression",
                       mtry = tune(),
                       trees = tune(),
                       # tree_depth = , # default is 6
                       learn_rate = .01) %>% set_engine("xgboost"))

xg1_tune <- xg1_wf %>%
  tune_grid(light_folds,
            grid = crossing(mtry = c(2, 5, 8, 11),
                       trees = seq(50, 400, 50),
                       #threshold = c(0.01) # default is 0.01
                       ),
            metrics = mset,
            control = grid_control)
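
Before looking at the tuning results, it is worth confirming what step_dummy() produces here. The prep_juice() helper defined earlier gives a quick peek at the preprocessed training data (a sketch; the number of indicator columns depends on how many states are in the sample):

# sketch: the design matrix holds log_light_avg, the outcome, and one 0/1
# indicator column per state (minus the reference level)
prep_juice(xg1_rec) %>%
  glimpse()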

Tuning results suggest using between 250 and 400 trees, with nearly equal performance when sampling 5, 8, or 11 variables at each split:

autoplot(xg1_tune) +
  labs(title = "Tuning xg1")

Figure 3.8: Tuning xg1


The best fit uses 400 trees and \(mtry = 8\), randomly sampling 8 variables at each split (nearly all of them in this case). The benefit of 400 trees versus 300 is very small, and given that we have ~650 data points for training (before considering the 10-fold cross-validation), I use the lesser number. Fewer trees also reduce the chance of overfitting.
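
A quick way to check how close the top candidates really are is tune::show_best(), which ranks them by the chosen metric (a sketch of the check):

# sketch: top tuning candidates ranked by ccc
xg1_tune %>%
  show_best(metric = "ccc", n = 5)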

xg1_wf_best <- xg1_wf %>%
  finalize_workflow(select_best(xg1_tune, metric = "ccc")) %>%
  update_model(boost_tree(mode = "regression",
                       mtry = 8,
                       trees = 300,
                       # tree_depth = 3,
                       learn_rate = .01) %>% set_engine("xgboost"))


So I use these:

print(xg1_wf_best)
## ══ Workflow ═════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ─────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_dummy()
## 
## ── Model ────────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 8
##   trees = 300
##   learn_rate = 0.01
## 
## Computational engine: xgboost


Summary metrics are good; however, the model underestimates log_pop_density across most of the radiance range:

my_formula <- "log_pop_density ~ log_light_avg + state"

xg1_fit_best <- xg1_wf_best %>%
  fit(train_split)

xg1_res <- xg1_fit_best %>%
  last_fit(main_split, metrics = mset)

xg1_metric_summary <- xg1_res %>%
  collect_metrics() %>%
  mutate(model = glue("xg1: {my_formula}")) %>%
  # xg1 is evaluated with a single fit on the assessment split, so there is no
  # fold-level std_err; n is set to n_folds only to allow row-binding below
  mutate(n = n_folds,
         std_err = NA_real_) %>%
  dplyr::select(.metric, .estimate, n, std_err, model)

metric_summary_for_plot <- glue("{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 2]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 2]], 3)}")

xg1_res %>%
  collect_predictions() %>%
  ggplot(aes(x = log_pop_density, y = .pred)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  theme(plot.title = element_text(size = 14)) +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: {metric_summary_for_plot}")
  )

Figure 3.9: xg1: Gradient boosted trees with XGBoost


3.1.8 Summary of train-test results

As expected, the highest-performing model is not the simplest. lin1 and lin2 are the two worst-performing models; I exclude them from the next stage. The others (simpler and more complex) perform similarly.


all_metrics <- bind_rows(
  lin1_metric_summary,
  lin2_metric_summary,
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  NULL
) %>%
  filter(.metric %in% c("ccc", "rsq"))

model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_errorbarh(aes(xmax = .estimate + std_err, xmin = .estimate - std_err),
                height = 0.2) +
  geom_point() +
  expand_limits(x = 1.0) +
  facet_wrap( ~ .metric, nrow = 1) +
  theme(plot.title.position = "plot") +
  labs(title = glue("Comparing model performance for metrics ",
                    "{glue_collapse(sort(unique(all_metrics$.metric)), sep = ', ')}"),
       subtitle = "Performance on train-test set",
       x = "mean metric estimate across all folds; error bars +/- std_err")

Figure 3.10: Summary of train-test results


all_metrics %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
    title = md("**Model evaluation on train-test data set**")
    ) %>%
  fmt_number(columns = c(.estimate, std_err),
             decimals = 3) %>%
  fmt_missing(columns = everything(),
              missing_text = "---") %>%
  cols_align(columns = model, align = "left")
Model evaluation on train-test data set
.metric  .estimate  n   std_err  model
ccc      0.966      10  ---      xg1: log_pop_density ~ log_light_avg + state
ccc      0.953      10  0.004    bs1: log_pop_density ~ bs(log_light_avg)
ccc      0.953      10  0.004    pol1: log_pop_density ~ poly(log_light_avg, 2)
ccc      0.948      10  0.004    lme1: log_pop_density ~ log_light_avg + (log_light_avg | state)
ccc      0.947      10  0.005    lin3: log_pop_density ~ log_light_avg
ccc      0.917      10  0.009    lin1: pop_density ~ light_avg
ccc      0.893      10  0.024    lin2: pop_density_official ~ light_avg
rsq      0.955      10  ---      xg1: log_pop_density ~ log_light_avg + state
rsq      0.942      10  0.008    bs1: log_pop_density ~ bs(log_light_avg)
rsq      0.942      10  0.008    pol1: log_pop_density ~ poly(log_light_avg, 2)
rsq      0.931      10  0.008    lin3: log_pop_density ~ log_light_avg
rsq      0.931      10  0.008    lme1: log_pop_density ~ log_light_avg + (log_light_avg | state)
rsq      0.931      10  0.020    lin1: pop_density ~ light_avg
rsq      0.922      10  0.022    lin2: pop_density_official ~ light_avg


3.2 Assessing model performance with the holdout data set

In this section I use 2000 data the models haven’t seen yet (the holdout data set).

lin3’s summary metrics are quite good; however, the curvature in the data is again visible in the way the model over-predicts at high and low radiance values:

lin3_pred <- lin3_wf %>%
  predict_on_holdout(., train_split, holdout)

lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))

lin3_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )

Figure 3.11: lin3 using holdout set


pol1 looks good but is slightly overestimating high radiance values:

pol1_pred <- pol1_wf %>%
  predict_on_holdout(., train_split, holdout)

pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))

pol1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.12: pol1 using holdout set


Again lme1 fails to do better than lin3:

# error=TRUE to continue execution in case model fails to converge

lme1_pred <- lme1_wf %>%
  predict_on_holdout(., train_split, holdout)

lme1_metric_summary <- bind_rows(
  rsq(lme1_pred, truth = log_pop_density, estimate = .pred),
  ccc(lme1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lme1_metric_summary$model))

lme1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lme1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lme1_metric_summary[[2, 1]]} = {round(lme1_metric_summary[[2, 3]], 3)}",
                       "; {lme1_metric_summary[[1, 1]]} = {round(lme1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.13: lme1 using holdout set


Again bs1 is performing quite well:

bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., train_split, holdout)

bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))

bs1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.14: bs1 using holdout set


xg1’s performance metrics are good, but the model is again systematically underestimating log_pop_density:

xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., train_split, holdout)

xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))

xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.15: xg1 using holdout set


3.2.1 Summary: model evaluation using holdout data set

Results are similar for all models; even the simple lin3 is competitive. I exclude lme1 from the next stage.

all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  lme1_metric_summary,
  bs1_metric_summary,
  xg1_metric_summary
)

model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3), 
                                 width = 5, side = "right", pad = "0")
                 ),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on holdout data set",
       x = "metric estimate",
       y = "")

Figure 3.16: Summary of holdout results


all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
  title = md("**Model evaluation on holdout data set**")
  ) %>%
  fmt_number(columns = .estimate,
             decimals = 3)
Model evaluation on holdout data set
.metric  .estimate  model
ccc      0.972      bs1: log_pop_density ~ bs(log_light_avg)
ccc      0.966      xg1: log_pop_density ~ log_light_avg + state
ccc      0.963      lin3: log_pop_density ~ log_light_avg
ccc      0.963      pol1: log_pop_density ~ poly(log_light_avg, 2)
ccc      0.963      lme1: log_pop_density ~ log_light_avg + (log_light_avg | state)
rsq      0.955      xg1: log_pop_density ~ log_light_avg + state
rsq      0.954      bs1: log_pop_density ~ bs(log_light_avg)
rsq      0.937      lme1: log_pop_density ~ log_light_avg + (log_light_avg | state)
rsq      0.937      lin3: log_pop_density ~ log_light_avg
rsq      0.937      pol1: log_pop_density ~ poly(log_light_avg, 2)


3.3 Comparing 2000 and 2010

I use the four remaining models to predict log_pop_density in 2010 from average county radiance (log_light_avg) in the same year: each workflow is refit on the 2000 holdout set and assessed on the 2010 data, and I compare this performance with the holdout results above.

lin3 looks pretty good, but the model overestimates high 2010 log_pop_density values.

lin3_pred <- lin3_wf %>%
  predict_on_holdout(., holdout, assess_df)

lin3_metric_summary <- bind_rows(
  rsq(lin3_pred, truth = log_pop_density, estimate = .pred),
  ccc(lin3_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(lin3_metric_summary$model))

lin3_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{lin3_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{lin3_metric_summary[[2, 1]]} = {round(lin3_metric_summary[[2, 3]], 3)}",
                       "; {lin3_metric_summary[[1, 1]]} = {round(lin3_metric_summary[[1, 3]], 3)}")
  )

Figure 3.17: lin3 2010 assess set results


pol1 is underestimating low 2010 log_pop_density values and overestimating most other values:

pol1_pred <- pol1_wf %>%
  predict_on_holdout(., holdout, assess_df)

pol1_metric_summary <- bind_rows(
  rsq(pol1_pred, truth = log_pop_density, estimate = .pred),
  ccc(pol1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(pol1_metric_summary$model))

pol1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{pol1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{pol1_metric_summary[[2, 1]]} = {round(pol1_metric_summary[[2, 3]], 3)}",
                       "; {pol1_metric_summary[[1, 1]]} = {round(pol1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.18: pol1 2010 assess set results


Like pol1, bs1 is underestimating low 2010 log_pop_density values and overestimating most other values:

bs1_pred <- bs_wf_best %>%
  predict_on_holdout(., holdout, assess_df)

bs1_metric_summary <- bind_rows(
  rsq(bs1_pred, truth = log_pop_density, estimate = .pred),
  ccc(bs1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(bs1_metric_summary$model))

bs1_pred %>% 
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{bs1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{bs1_metric_summary[[2, 1]]} = {round(bs1_metric_summary[[2, 3]], 3)}",
                       "; {bs1_metric_summary[[1, 1]]} = {round(bs1_metric_summary[[1, 3]], 3)}")
       
  )

Figure 3.19: bs1 2010 assess set results


xg1 is underestimating low and high log_pop_density values:

xg1_pred <- xg1_fit_best %>%
  predict_on_holdout(., holdout, assess_df)

xg1_metric_summary <- bind_rows(
  rsq(xg1_pred, truth = log_pop_density, estimate = .pred),
  ccc(xg1_pred, truth = log_pop_density, estimate = .pred)
) %>%
  mutate(model = unique(xg1_metric_summary$model))

xg1_pred %>%
  ggplot(aes(log_pop_density, .pred, color = state)) +
  geom_abline(lty = 2, alpha = 0.5) +
  geom_point(alpha = 0.15) +
  coord_fixed() +
  labs(title = glue("{xg1_metric_summary$model}"),
       subtitle = glue("Actual and predicted log_pop_density: ", 
                       "{xg1_metric_summary[[2, 1]]} = {round(xg1_metric_summary[[2, 3]], 3)}",
                       "; {xg1_metric_summary[[1, 1]]} = {round(xg1_metric_summary[[1, 3]], 3)}")
  )

Figure 3.20: xg1 2010 assess set results


3.3.1 Summary: model performance on Census 2010 data

The simplest model, lin3, performs slightly better, and the next-simplest models, pol1 and bs1, are essentially equal. xg1 does worse at high and low log_pop_density values, a sign that it may be overfitted on the training data. Even so, the differences in performance among the models are quite small.
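
One way to make that high/low-end bias visible in a single table is the mean residual by decile of actual log_pop_density for each model’s 2010 predictions (a sketch using the prediction tibbles created above):

# sketch: mean residual (.pred - log_pop_density) within each decile of actual
# density; negative values mean under-prediction, positive mean over-prediction
bind_rows(
  lin3 = lin3_pred,
  pol1 = pol1_pred,
  bs1  = bs1_pred,
  xg1  = xg1_pred,
  .id = "model"
) %>%
  group_by(model) %>%
  mutate(density_decile = ntile(log_pop_density, 10)) %>%
  group_by(model, density_decile) %>%
  summarise(mean_residual = mean(.pred - log_pop_density), .groups = "drop")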

Looking at the absolute value of the difference in means between predicted and true log_pop_density, I see that all four models perform similarly:

all_means <- bind_rows(
  with(lin3_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(lin3_metric_summary$model)),
  with(xg1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(xg1_metric_summary$model)),
  with(pol1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(pol1_metric_summary$model)),
  with(bs1_pred, t.test(.pred, log_pop_density, paired = TRUE)) %>% tidy() %>% 
    mutate(model = unique(bs1_metric_summary$model))
) %>%
  rename(diff_in_means = estimate) %>%
  arrange(desc(abs(diff_in_means))) %>%
  mutate(model = as_factor(model)) # creates levels with current ordering

all_means %>%
  ggplot(aes(abs(diff_in_means), model)) +
  geom_errorbarh(aes(xmin = abs(conf.low), xmax = abs(conf.high)), height = 0.1) +
  geom_point() +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Difference in means between predicted and\nactual log_pop_density (2010 data)",
       subtitle = "T-test at 95% CI.",
       x = "difference in means",
       y = "")

Figure 3.21: Differences in the means (predicted v actual)


all_metrics <- bind_rows(
  lin3_metric_summary,
  pol1_metric_summary,
  xg1_metric_summary,
  bs1_metric_summary
)

model_levels = all_metrics %>%
  filter(.metric == "ccc") %>%
  arrange(.estimate) %>%
  pull(model)

all_metrics %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(.estimate, model)) +
  geom_point() +
  geom_label(aes(label = str_pad(round(.estimate, 3), 
                                 width = 5, side = "right", pad = "0")
                 ),
             size = 3, nudge_x = -0.1, hjust = 1) +
  expand_limits(x = 0) +
  facet_wrap(~ .metric, scales = "free_x") +
  theme(legend.position = "none",
        plot.title.position = "plot") +
  labs(title = "Model evaluation on 2010 data",
       x = "metric estimate",
       y = "")

Figure 3.22: Model evaluation on 2010 data

all_metrics %>%
  dplyr::select(-.estimator) %>%
  arrange(.metric, desc(.estimate)) %>%
  gt() %>%
  tab_header(
  title = md("**Model evaluation on 2010 data**")
  ) %>%
  fmt_number(columns = c(.estimate),
             decimals = 3)
Model evaluation on 2010 data
.metric  .estimate  model
ccc      0.967      lin3: log_pop_density ~ log_light_avg
ccc      0.957      pol1: log_pop_density ~ poly(log_light_avg, 2)
ccc      0.957      bs1: log_pop_density ~ bs(log_light_avg)
ccc      0.953      xg1: log_pop_density ~ log_light_avg + state
rsq      0.945      lin3: log_pop_density ~ log_light_avg
rsq      0.927      bs1: log_pop_density ~ bs(log_light_avg)
rsq      0.926      pol1: log_pop_density ~ poly(log_light_avg, 2)
rsq      0.915      xg1: log_pop_density ~ log_light_avg + state