6 Numbers

Show the code

###### Get state populations
# v2020_pl <- load_variables(2020, "pl", cache = TRUE)
# v2020_dp <- load_variables(2020, "dp", cache = TRUE)
# v2020_dhc <- load_variables(2020, "dhc", cache = TRUE)

pop2020_raw <- get_decennial(
  geography = "state",
  variables = "DP1_0001C", # "P001001",
  year = 2020,
  sumfile = "dp",
  geometry = TRUE,
  cb = TRUE,
  resolution = "20m", # cartographic boundaries simplified
  cache_table = TRUE
) |>
  clean_names()

pop2020 <- pop2020_raw |>
  filter(name != "Puerto Rico") |>
  filter(!name %in% c("Alaska", "Hawaii", "District of Columbia")) |>
  rename(pop = value) |>
  mutate(
    area = st_area(geometry),
    area_sqmi = as.numeric(area) / 2589988.110336 #m^2 to mi^2
  ) |>
  left_join(us_state_data, by = join_by(name == state)) |>
  mutate(area_diff_pct = area_sqmi / state_area) |>
  filter(name != "District of Columbia")

# continental_us_border <- st_union(pop2020)

# bbox_continental <- continental_us_border |>
#   st_bbox()

dams_by_state_with_pop <- dams_by_state |>
  left_join(
    pop2020 |>
      st_drop_geometry(),
    by = join_by(state == name, state_abb == state_abb)
  ) |>
  left_join(wet, by = join_by(state == usgs_state)) |>
  mutate(
    #area_diff_pct2 = state_area / usgs_total_area_square_miles, # R value / USGS value
    #area_diff_pct3 = area_sqmi / usgs_total_area_square_miles, # calculated area / USGS value
    area_diff_pct4 = state_area / usgs_total_area_less_great_lakes, # R value / USGS value
    area_diff_pct5 = area_sqmi / usgs_total_area_less_great_lakes # calculated area / USGS value
  )

###### prepare-ratio-data

dams_with_pop <- dams_no_geo |>
  st_drop_geometry() |>
  select(
    name,
    nidId,
    state,
    nidStorage,
    normalStorage,
    nidHeight,
    damLength,
    surfaceArea,
    drainageArea,
    maxDischarge,
    big_dam
  ) |>
  left_join(pop2020, by = join_by(state == name)) |>
  left_join(wet, by = join_by(state == usgs_state))

dams_ratios_per_state <- dams_with_pop |>
  filter(nidId != "MI00650") |> # Correct for Soo Locks in Michigan
  # NM average ratio is way out of proportion. These two have obvious errors in surfaceArea values
  filter(
    !nidId %in%
      c(
        "NM00039", # Encino Det Dam 083
        "NM00047"
      ) # Alamos Detention Dam
  ) |>
  reframe(
    n_dams = n(),
    sqmi_per_dam = first(area_sqmi) / n_dams,
    pct_surface_area_dams = sum(surfaceArea, na.rm = TRUE) /
      640 /
      first(area_sqmi), # 640 acres/sqmi
    # pct_of_water_area = sum(surfaceArea, na.rm = TRUE) / 640 / first(usgs_water_area_square_miles),
    pop_per_dam = first(pop) / n_dams,
    median_surface_area_to_nidStorage = median(
      surfaceArea / nidStorage,
      na.rm = TRUE
    ),
    avg_surface_area_to_nidStorage = mean(
      surfaceArea / nidStorage,
      na.rm = TRUE
    ),
    .by = state
  ) |>
  left_join(
    us_state_data,
    by = "state"
  ) |>
  mutate(
    state = fct_reorder(state, n_dams),
    state_abb = fct_reorder(state_abb, n_dams)
  )

dams_ratios_per_state_big <- dams_with_pop |>
  filter(nidId != "MI00650") |> # Correct for Soo Locks in Michigan
  filter(big_dam) |>
  reframe(
    n_dams = n(),
    sqmi_per_dam = first(area_sqmi) / n_dams,
    pct_surface_area_dams = sum(surfaceArea, na.rm = TRUE) /
      640 /
      first(area_sqmi), # 640 acres/sqmi
    # pct_of_water_area = sum(surfaceArea, na.rm = TRUE) / 640 / first(usgs_water_area_square_miles),
    pop_per_dam = first(pop) / n_dams,
    median_surface_area_to_nidStorage = median(
      surfaceArea / nidStorage,
      na.rm = TRUE
    ),
    .by = state
  ) |>
  mutate(state = fct_reorder(state, n_dams))

dams_ratios_per_state_distribution <- dams_with_pop |>
  filter(nidId != "MI00650") |> # Correct for Soo Locks in Michigan
  reframe(
    name = name,
    nidId = nidId,
    surface_area_to_nidStorage = surfaceArea / nidStorage,
    surfaceArea = surfaceArea,
    normalStorage = normalStorage,
    nidStorage = nidStorage,
    state = state
    # .by = state
  )

6.1 Volume of water under management

Perhaps the most important metric for dams is the amount of water¹ stored by the dam. I think of this as “water under management”. It’s interesting to look at this by primary owner and primary purpose (Figure 6.1), and by state (Figure 6.2).

Show the code

dta_for_storage <- dams_no_geo |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, nidStorage, yearCompleted, yearCompletedId) |>
  filter(!is.na(nidStorage))

storage_by_primary_purpose <- dta_for_storage |>
  filter(nidId != "MI00650") |>
  mutate(primaryOwnerTypeId = if_else(primaryOwnerTypeId == "Tribal Government",
                                      "Other or not specified",
                                      primaryOwnerTypeId)) |>
  count(primaryPurposeId, primaryOwnerTypeId, wt = nidStorage) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, n, sum),
         primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum))

storage_by_primary_owner <- dta_for_storage |>
  filter(nidId != "MI00650") |>
  mutate(primaryOwnerTypeId = if_else(primaryOwnerTypeId == "Tribal Government",
                                      "Other or not specified",
                                      primaryOwnerTypeId)) |>
  count(primaryOwnerTypeId, wt = nidStorage) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum))

storage_by_state <- dta_for_storage |>
  count(state, state_abb, wt = nidStorage) |>
  mutate(state = fct_reorder(state, n),
         state_abb = fct_reorder(state_abb, n)
         )

storage_by_state_mi_corrected <- dta_for_storage |>
  filter(nidId != "MI00650") |>
  count(state, state_abb, wt = nidStorage) |>
  mutate(state = fct_reorder(state, n),
         state_abb = fct_reorder(state_abb, n)
         )

Show the code

p1 <- storage_by_primary_purpose |>
  filter(primaryOwnerTypeId != "Tribal Government") |>
  ggplot(aes(n, primaryPurposeId)) +
  geom_col(aes(fill = primaryOwnerTypeId)) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.04))) +
  scale_fill_viridis_d(end = 0.9) +
  guides(fill = guide_legend(position = "inside",
                             reverse =  TRUE)) +
  theme(legend.position.inside = c(0.6, 0.4)) +
  labs(
    subtitle = "A: Primary purpose",
    x = "acre ft",
    y = NULL,
    fill = "Primary owner"
  )

p2 <- storage_by_primary_owner |>
  ggplot(aes(n, primaryOwnerTypeId)) +
  geom_col() + #aes(fill = primaryPurposeId)) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.04))) +
  labs(
    subtitle = "B: Primary owner",
    x = "acre ft",
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "Primary purpose and primary owner\nby amount of water under management (nidStorage)",
    subtitle = "Continental USA. Michigan corrected",
    caption = my_caption
  )

Figure 6.1: `primaryPurpose` and `primaryOwner` by volume of water being managed (`nidStorage`)

The amount of water under management by state needs a correction before we can use it. In Figure 6.2 panel A, Michigan is shown as having so much more water under management than other states because of one structure: Soo Locks in Sault Ste. Marie (nidID MI00650), through which ships travel between Lake Superior and the other Great Lakes. Because of Soo Locks, Michigan “claims” nearly 278M acre-feet of water in Lake Superior. Removing this one structure in panel B moves Michigan down the ranking to 42 of 48 states. Where I’m not looking at nidStorage, I leave Soo Locks in the data sets. Where I remove it, the plot includes the note: “Michigan corrected”.

For other state-level data see Section 6.7 below.

Show the code

n_states <- nrow(storage_by_state)
p1 <- storage_by_state |>
  ggplot(aes(n, state)) +
  geom_col() +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.04))) +
  labs(
    subtitle = glue("A: {n_states} states"),
    x = "acre ft",
    y = NULL
  )

p2 <- storage_by_state_mi_corrected |>
  ggplot(aes(n, state)) +
  geom_col() +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.04))) +
  labs(
    subtitle = glue("B: {n_states} states (Michigan corrected)"),
    x = "acre ft",
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "State by amount of water under management (nidStorage)",
    subtitle = "Continental USA",caption = my_caption
  )

Figure 6.2: Volume of water under management by state (nidStorage)

6.2 Other primary dam metrics

Primary dam metrics are log-normal-ish and vary over nearly seven orders of magnitude: damHeight, damLength, maxDischarge, nidStorage, surfaceArea, and drainageArea (Figure 6.3). The latter seems like it might be a composite of two log-normal distributions. Indeed, big dams seem to be supplying a distinct distribution (Figure 6.4).

Show the code

dta_for_plot <- dams |>
  st_drop_geometry() |>
  select(name, state, big_dam, nidId, nidStorage, nidHeight, damLength, surfaceArea, drainageArea, maxDischarge) |>
  filter(nidId != "MI00650") |>
  filter(damLength > 10)

n_dams_allft <- nrow(dta_for_plot)

dta_for_plot_long <- dta_for_plot |>
  pivot_longer(
    cols = nidStorage:maxDischarge,
    names_to = "metric",
    values_to = "value"
  ) |>
  filter(value > 0) |>
  mutate(metric_label = case_match(
    metric,
    "damLength"        ~ "dam length (ft)",
    "nidHeight"        ~ "dam height (ft)",
    "drainageArea"     ~ "drainage area (mi^2)",
    "maxDischarge"     ~ "max discharge (ft^3/sec)",
    "surfaceArea"      ~ "surface area (acre)",
    "nidStorage"       ~ "storage (acre-feet)",
    .default = glue("error: {metric}")
  ))

dta_for_plot_medians = dta_for_plot_long |>
  reframe(value = median(value),
          n_dams = n(),
          label_y = 0.3 * max(value, na.rm = TRUE),
            .by = metric_label)

dta_for_plot_long |>
  ggplot() +
  geom_histogram(aes(value),
                 bins = 30,
                 na.rm = TRUE) +
  geom_vline(data = dta_for_plot_medians,
             aes(xintercept = value),
             lty = 2,
             linewidth = 0.25,
             alpha = 0.5,
             na.rm = TRUE) +
  geom_label(data = dta_for_plot_medians,
            aes(x = Inf, y = Inf, label = glue("n = {n_dams}\nmedian = {round(value, digits = 1)}")),
            hjust = 1, vjust = 1,
            size = 3,
            na.rm = TRUE
            ) +
  scale_x_log10(label = label_number(scale_cut = cut_short_scale()),
                expand = expansion(mult = c(0, 0.04))) +
  facet_wrap(~metric_label, scales = "free_x") +
  labs(
    title = "Primary dam metrics ",
    subtitle = glue("Includes the portion of the {n_dams_allft} dams that are not missing values, which varies by metric",
                    "\nContinental USA. Michigan corrected."),
    x = NULL,
    y = "Number of dams",
    caption = my_caption
  )

Figure 6.3: Histograms of primary dam metrics

For big dams, damHeight and maxDischarge are one end of the distribution. The other big dam metric distributions are log-normal-ish.

Show the code

dta_for_plot <- dams |>
  st_drop_geometry() |>
  select(name, state, big_dam, nidId, nidStorage, nidHeight, damLength, surfaceArea, drainageArea, maxDischarge) |>
  filter(big_dam) |>
  filter(nidId != "MI00650") |>
  filter(damLength > 10)

dta_for_plot_long <- dta_for_plot |>
  pivot_longer(
    cols = nidStorage:maxDischarge,
    names_to = "metric",
    values_to = "value"
  ) |>
  filter(value > 0) |>
  mutate(metric_label = case_match(
    metric,
    "damLength"    ~ "dam length (ft)",
    "nidHeight"    ~ "dam height (ft)",
    "drainageArea" ~ "drainage area (mi^2)",
    "maxDischarge" ~ "max discharge (ft^3/sec)",
    "surfaceArea"  ~ "surface area (acre)",
    "nidStorage"   ~ "storage (acre-feet)",
    .default = glue("error: {metric}")
  ))

dta_for_plot_medians = dta_for_plot_long |>
  reframe(value = median(value),
          n_dams = n(),
          .by = metric_label)

n_dams_big <- nrow(dta_for_plot)

dta_for_plot_long |>
  ggplot() +
  geom_histogram(aes(value),
                 bins = 30) +
  geom_vline(data = dta_for_plot_medians,
             aes(xintercept = value),
             lty = 2,
             linewidth = 0.25,
             alpha = 0.5) +
  geom_label(data = dta_for_plot_medians,
            aes(x = Inf, y = Inf, label = glue("n = {n_dams}\nmedian = {round(value, digits = 1)}")),
            hjust = 1, vjust = 1,
            size = 3) +
  scale_x_log10(label = label_number(scale_cut = cut_short_scale())) +
  facet_wrap(~metric_label, scales = "free_x") +
  labs(
    title = "Big dams: primary dam metrics",
    subtitle = glue("Includes the portion of the {n_dams_big} dams that are not missing values, which varies by metric",
                    "\nContinental USA. Michigan corrected."),
    x = NULL,
    y = "Number of dams",
    caption = my_caption
  )

Figure 6.4: Primary dam metrics for big dams

6.3 Counts

A plurality of dams are privately owned, for recreation, and made of earth (Figure 9.5).

Show the code

p1 <- dams_no_geo |>
  count(primaryOwnerTypeId) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryOwnerTypeId)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "A: Primary owner type",
    x = NULL,
    y = NULL
  )

p2 <- dams_no_geo |>
  count(primaryPurposeId) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryPurposeId)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "B: Primary purpose",
    x = NULL,
    y = NULL
  )

p3 <- dams_no_geo |>
  count(primaryDamTypeId) |>
  mutate(primaryDamTypeId = fct_reorder(primaryDamTypeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryDamTypeId)) +
  geom_col() + 
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "C: Primary dam type",
    x = NULL,
    y = NULL
  )

p1 + p2 + p3 +
  plot_annotation(
    title = "Basic counts: owner, purpose, dam type",
    caption = my_caption
  ) & theme(plot.title.position = "plot")

Figure 6.5: Basic counts: owner, purpose, dam type

Big dam proportions differ from the set of all dams (Figure 6.6) in the following ways:

A smaller proportion of big dams are privately owned (only 37% compared to 64%)
Recreation is not a major purpose. Instead flood risk control, water supply, and hydroelectric dominate the purposes (combined they are 60%)
There is more diversity in the dam type, since most small dams are made of earth

Show the code

p1 <- dams_no_geo |>
  filter(big_dam) |>
  count(primaryOwnerTypeId) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryOwnerTypeId)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "A: Primary owner type",
    x = NULL,
    y = NULL
  )

p2 <- dams_no_geo |>
  filter(big_dam) |>
  count(primaryPurposeId) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryPurposeId)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "B: Primary purpose",
    x = NULL,
    y = NULL
  )

p3 <- dams_no_geo |>
  filter(big_dam) |>
  count(primaryDamTypeId) |>
  mutate(primaryDamTypeId = fct_reorder(primaryDamTypeId, n),
         n_pct = `n` / sum(`n`)) |>
  ggplot(aes(n, primaryDamTypeId)) +
  geom_col() + 
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "C: Primary dam type",
    x = NULL,
    y = NULL
  )

p1 + p2 + p3 +
  plot_annotation(
    title = "Big dam basic counts: owner, purpose, dam type",
    subtitle = "Includes dams with height > 100 ft",
    caption = my_caption
  ) & theme(plot.title.position = "plot")

Figure 6.6: Big dam basic counts: owner, purpose, dam type

A matrix of primaryPurpose and primaryOwner make visible the most common combinations (Figure 6.7).

Show the code

owner_purpose <- dams |>
  st_drop_geometry() |>
  count(primaryOwnerTypeId, primaryPurposeId) |>
  complete(primaryOwnerTypeId, primaryPurposeId, fill = list(n = 0)) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, -n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum))

owner_purpose |>
  ggplot(aes(x = primaryOwnerTypeId, y = primaryPurposeId)) +
  geom_tile(aes(fill = n),
            alpha = 0.98) +
  geom_text(aes(label = n),
            color = "white",
             size = 3) +
  theme(plot.title.position = "plot",
        axis.text.x = element_text(angle = 60, hjust = 0)
        ) +
  scale_x_discrete(position = "top") +
  scale_fill_continuous(transform = scales::transform_log1p(),
                        breaks = c(0, 10, 100, 1000, 10000)) +
  labs(
    title = "Primary purpose by primary owner type - count",
    x = "Primary owner",
    y = "Primary purpose",
    caption = my_caption
  )

Figure 6.7: Primary purpose by primary owner type - count

The top ten combinations are shown below (Table 6.1):

Show the code

owner_purpose |>
  slice_max(order_by = n, n = 10) |>
  mutate(idx = row_number()) |>
  gt() |>
  tab_options(table.font.size = 11) |>
  tab_header(md("**Top 10 combinations of primary owner type and primary purpose**")) |>
  cols_align(columns = c(starts_with("primary")),
             align = "left") |>
  fmt_number(columns = n,
             decimals = 0)

Table 6.1

Top 10 combinations of primary owner type and primary purpose
primaryOwnerTypeId	primaryPurposeId	n	idx
Private	Recreation	23,975	1
Local Government	Flood Risk Reduction	8,734	2
Private	Fire Protection, Stock, Or Small Fish Pond	8,136	3
Private	Other or not specified	7,982	4
Private	Irrigation	6,356	5
Local Government	Recreation	3,088	6
State	Flood Risk Reduction	2,730	7
Private	Flood Risk Reduction	2,409	8
State	Recreation	2,322	9
Local Government	Water Supply	2,168	10

Large dams, by contrast, are mostly federally owned and for flood reduction, whereas local governments’ dams are mostly for water supply, and public utility-owned dams are mostly for electricity generation (Figure 6.8).

Show the code

owner_purpose_big <- dams_no_geo |>
  filter(big_dam) |> # only "big" dams
  count(primaryOwnerTypeId, primaryPurposeId) |>
  complete(primaryOwnerTypeId, primaryPurposeId, fill = list(n = 0)) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, -n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum))

owner_purpose_big |>
  ggplot(aes(x = primaryOwnerTypeId, y = primaryPurposeId)) +
  geom_tile(aes(fill = n),
            alpha = 0.98) +
  geom_text(aes(label = n),
            color = "white",
             size = 3) +
  theme(plot.title.position = "plot",
        axis.text.x = element_text(angle = 60, hjust = 0)
        ) +
  scale_x_discrete(position = "top") +
  scale_fill_continuous(transform = scales::transform_log1p(),
                        breaks = c(0, 10, 100, 1000, 10000)) +
  labs(
    title = "Big dams: primary purpose by primary owner type - count",
    subtitle = "'Big' dam defined as having height >= 100 ft",
    x = "Primary owner",
    y = "Primary purpose",
    caption = my_caption
  )

Figure 6.8: Primary purpose by primary owner type - count

The most common combinations for big dams are shown below:

Show the code

owner_purpose_big |>
  slice_max(order_by = n, n = 10) |>
  mutate(idx = row_number()) |>
  gt() |>
  tab_options(table.font.size = 11) |>
  tab_header(md("**Big dams: top 10 combinations of primary owner type and primary purpose**<br>Dams with height > 100 ft")) |>
  cols_align(columns = c(starts_with("primary")),
             align = "left") |>
  fmt_number(columns = n,
             decimals = 0)

Table 6.2

Big dams: top 10 combinations of primary owner type and primary purpose Dams with height > 100 ft
primaryOwnerTypeId	primaryPurposeId	n	idx
Federal	Flood Risk Reduction	292	1
Private	Tailings	229	2
Local Government	Water Supply	140	3
Private	Hydroelectric	94	4
Public Utility	Hydroelectric	93	5
Public Utility	Water Supply	75	6
Private	Other or not specified	73	7
Federal	Irrigation	59	8
Local Government	Flood Risk Reduction	52	9
Federal	Hydroelectric	49	10

Alluvial plots provide a bird’s eye view of counts of multiple properties. I go deeper into North Carolina dams in Chapter 7 Owners in NC exclusively using this kind of plot.

Show the code

# following example at https://mjfrigaard.github.io/fm-ggp2/alluvial_charts.html

dams_wide <- dams |>
  st_drop_geometry() |>
  # filter(nidHeightId == "Greater than 100 ft") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId) |>
  filter(primaryOwnerTypeId != "Other or not specified",
         primaryPurposeId != "Other or not specified") |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, 9)) |>
  count(primaryOwnerTypeId, primaryPurposeId) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum)
  )

p1 <- dams_wide |>
  ggplot(aes(axis1 = primaryOwnerTypeId, 
             axis2 = primaryPurposeId,
             y = n)) +
  scale_x_discrete(
    limits = c("Owner", "Purpose"),
    expand = c(0.1, 0.07),
    position = "top") +
  geom_alluvium(aes(fill = primaryPurposeId),
                show.legend = FALSE) +
  geom_stratum(linewidth = 0.05,
               alpha = 0.8) +
  geom_text(stat = "stratum",
    aes(label = after_stat(stratum)),
      size = 3) +
  scale_fill_brewer(palette = "Paired") +
  theme(axis.text.x = element_text(size = rel(2))) +
  labs(
    subtitle = "A. All dams",
    y = NULL
  )

p1 +
  plot_annotation(
    title = "Dams in continental US by primary owner and primary purpose",
    caption = my_caption
  )

Show the code

# following example at https://mjfrigaard.github.io/fm-ggp2/alluvial_charts.html

big_dams_wide <- dams |>
  st_drop_geometry() |>
  select(nidId, big_dam, primaryOwnerTypeId, primaryPurposeId) |>
  filter(big_dam, 
         primaryOwnerTypeId != "Other or not specified",
         primaryPurposeId != "Other or not specified") |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, 7)) |>
  count(primaryOwnerTypeId, primaryPurposeId) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum)
  )

p1 <- big_dams_wide |>
  ggplot(aes(axis1 = primaryOwnerTypeId, 
             axis2 = primaryPurposeId,
             y = n)) +
  scale_x_discrete(
    limits = c("Owner", "Purpose"),
    expand = c(0.1, 0.07),
    position = "top") +
  geom_alluvium(aes(fill = primaryPurposeId),
                show.legend = FALSE) +
  geom_stratum(linewidth = 0.05,
               alpha = 0.8) +
  geom_text(stat = "stratum",
    aes(label = after_stat(stratum)),
      size = 3) +
  scale_fill_brewer(palette = "Paired") +
  theme(axis.text.x = element_text(size = rel(2))) +
  labs(
    subtitle = "A. All dams",
    y = NULL
  )

p1 +
  plot_annotation(
    title = "Big dams in continental US by primary owner and primary purpose",
    caption = my_caption
  )

Figure 6.10: Big dams by owner and purpose

6.4 Ratios

Ratios are one thing expressed as a multiple of the other, i.e., divided by the other. Let’s look at surfaceArea::nidStorage. Dams built in hilly areas seem to me to be more likely to have low ratios of surfaceArea to nidStorage, i.e., the dams are likely to be deeper rather than broader. Can we substantiate or disprove this conjecture?

First, some scatter plots in Figure 6.11:

Panel A provides context: state population by area.
Panel B plots the number dams in each state by state area.
Panel C plots number of dams in the state by state population.
And panel D plots dam storage (nidStorage) by number of dams in the state.

In each plot one or more regional clusters are visible.

Show the code

p1 <- dams_by_state_with_pop |>
  ggplot() +
  geom_text(aes(usgs_total_area_less_great_lakes, pop, label = state_abb, color = state_region),
            check_overlap = FALSE,
            show.legend = FALSE,
            key_glyph = "point") +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  labs(
    subtitle = "A: Population by surface area",
    x = "State area (sq mi)",
    y = "Population"
  )

p2 <- dams_by_state_with_pop |>
  ggplot() +
  geom_text(aes(usgs_total_area_less_great_lakes, n, label = state_abb, color = state_region),
            check_overlap = FALSE,
            key_glyph = "point") +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  guides(color = guide_legend(position = "inside",
                              override.aes = list(size = 3))) +
  theme(legend.position.inside = c(0.8, 0.7)) +
  labs(
    subtitle = "B: Number of dams by area",
    x = "State area (sq mi)",
    y = "Number of dams in the NID"
  )

p3 <- dams_by_state_with_pop |>
  ggplot() +
  geom_text(aes(pop, n, label = state_abb, color = state_region),
            check_overlap = FALSE,
            show.legend = FALSE,
            key_glyph = "point") +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  labs(
    subtitle = "C: Number of dams by population",
    x = "Population",
    y = "Number of dams in the NID"
  )

p4 <- dams_by_state_with_pop |>
  left_join(
    storage_by_state_mi_corrected |>
      select(state_abb, nidStorage = n),
    by = "state_abb"
  ) |>
  ggplot() +
  geom_text(aes(n, nidStorage, label = state_abb, color = state_region),
            check_overlap = FALSE,
            show.legend = FALSE,
            key_glyph = "point") +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  labs(
    subtitle = "D: Dams' volume of water under management",
    x = "Number of dams in the NID",
    y = "nidStorage (acre ft)"
  )

p1 + p2 + p3 + p4 +
  plot_annotation(
    title = "State land area, population, and number of dams",
    subtitle = "Area less Great Lakes (if applicable). Michigan corrected.",
    caption = my_caption_nid_census
  )

Figure 6.11: State land area, population, and number of dams

Figure 6.12 below plots the ratio of surfaceArea to water under management (nidStorage) using medians (panels A and C) and means (panels B and C). In the plots that employ means, outliers and data errors have more effect. For example, in panel B Washington (WA), Colorado (CO) and Oregon (OR) are higher in the plot, from which I infer that each state has some large, shallow reservoirs (or large data errors).

I found and removed two data errors for New Mexico. Note the reported discrepancy in damHeight and nidStorage for these two dams with very large surfaceArea when they are placed in context with the ten dams in NM with the most surface area (Table 6.3).

Show the code

dams_no_geo |>
  filter(state_abb == "NM") |>
  slice_max(order_by = surfaceArea,
            n = 10) |>
  select(name, nidId, surfaceArea, nidHeight, nidStorage) |>
  gt() |>
  tab_options(table.font.size = 10) |>
  tab_header(md(glue("**Dams in New Mexico with largest reported `surfaceArea`**",
                     "<br>*Acres. Click column headers to sort*"))) |>
  tab_source_note(md(glue("*Source: NID. Table: Daniel Moul.*"))) |>
  fmt_number(columns = c(surfaceArea, nidHeight, nidStorage),
             decimals = 0) |>
  tab_style(
    style = list(
      cell_fill(color = "lightyellow")
    ),
    locations = cells_body(
      rows = str_detect(name, "Encino") | str_detect(name, "Alamos")
    )
    ) |>
  opt_interactive(
    use_pagination = FALSE
  )

Dams in New Mexico with largest reported surfaceArea
Acres. Click column headers to sort

Source: NID. Table: Daniel Moul.

Table 6.3: Dams in New Mexico with most reported surfaceArea of water behind dams

The ratio dam surface area::nidStorage is an indicator of the local geography: a high ratio indicates a wide, shallow reservoir behind the dam; a low ratio indicates a deep reservoir in a valley with steep gradients or perhaps a quarry or other excavated area.

Plotting median and mean state ratios by state area per dam (Figure 6.12) reveals some regional clusters.

Show the code

p1 <- dams_ratios_per_state |>
  ggplot(aes(sqmi_per_dam, median_surface_area_to_nidStorage, color = state_region)) +
  geom_text(aes(label = state_abb),
            check_overlap = FALSE,
            show.legend = FALSE,
            key_glyph = "point") +
  geom_mark_hull() +
  scale_x_log10(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  guides(color = "none") +
  labs(
    subtitle = "A: Median(dam surface area::nidStorage) by sq mi per dam",
    x = "State area (sq mi) per dam (log10 scale)",
    y = "Median ratio"
  )

p2 <- dams_ratios_per_state |>
  ggplot(aes(sqmi_per_dam, avg_surface_area_to_nidStorage, color = state_region)) +
  geom_text(aes(label = state_abb),
            check_overlap = FALSE,
            show.legend = FALSE,
            key_glyph = "point") +
  geom_mark_hull() +
  scale_x_log10(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  scale_y_log10(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.04))) +
  guides(color = "none") +
  labs(
    subtitle = "B: Mean(dam surface area::nidStorage) by sq mi per dam",
    x = "State area (sq mi) per dam (log10 scale)",
    y = "Mean ratio (log10 scale)"
  )

p1 + p2 +
  plot_annotation(
    title = "State ratios: median and mean surface area to dam volume",
    subtitle = "Colored by region",
    caption = my_caption
  )

Figure 6.12: Median and mean state values for ratio of `surfaceArea` to `nidStorage`

6.5 Dam completion and modification

Public dams are major civil works projects that require political will, budgeting, financing, and on-going funding to operate and maintain. Most dams were built after World War II and prior to the Reagan administration in the eighties.

6.5.1 New dams

Show the code

dta_for_plot <- dams_no_geo |>
  select(name, state, nidHeight, nidHeightId, yearCompleted, yearCompletedId, yearsModified)

p1 <- dta_for_plot |>
  ggplot() +
  geom_bar(aes(yearCompletedId, fill = nidHeightId)) +
  # scale_x_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  guides(fill = "none") +
  coord_flip() +
  theme(plot.title.position = "plot") +
  labs(
    subtitle = "A: By decade",
    x = NULL,
    y = NULL
  )

p2 <- dta_for_plot |>
  filter(yearCompletedId != "Undetermined") |>
  ggplot() +
  geom_vline(xintercept = c(1950, 1979),
             lty = 2,
             linewidth = 0.2,
             alpha = 0.5) +
  stat_ecdf(aes(yearCompleted)) +
  scale_x_continuous(breaks = c(1850, 1900, 1950, 1979, 2000)) +
  scale_y_continuous(label = label_percent(),
                     expand = expansion(mult = c(0.0, 0.04))) +
  theme(plot.title.position = "plot") +
  coord_cartesian(xlim = c(1850, NA)) +
  labs(
    subtitle = "B: Cumulative distribution: year completed",
    x = NULL,
    y = "Percent of all dams with year built data"
  )

p3 <- dta_for_plot |>
  filter(yearCompletedId != "Undetermined") |>
  ggplot() +
  geom_vline(xintercept = c(1950, 1979),
             lty = 2,
             linewidth = 0.2,
             alpha = 0.5) +
  geom_density(aes(yearCompleted, color = nidHeightId)) +
  scale_x_continuous(breaks = c(1850, 1900, 1950, 1979, 2000)) +
  guides(color = "none") +
  theme(plot.title.position = "plot") +
  coord_cartesian(xlim = c(1850, NA)) +
  labs(
    subtitle = "C: Density plot: year completed by height",
    x = NULL
  )

p4 <- dta_for_plot |>
  filter(yearCompletedId != "Undetermined") |>
  ggplot() +
  geom_vline(xintercept = c(1950, 1979),
             lty = 2,
             linewidth = 0.2,
             alpha = 0.5) +
  stat_ecdf(aes(yearCompleted, color = nidHeightId)) +
  scale_x_continuous(breaks = c(1850, 1900, 1950, 1979, 2000)) +
  scale_y_continuous(label = label_percent(),
                     expand = expansion(mult = c(0.0, 0.04))) +
  guides(color = guide_legend(position = "inside",
                              override.aes = list(linewidth = 3))) +
  theme(plot.title.position = "plot",
        legend.position.inside = c(0.3, 0.7)) +
  coord_cartesian(xlim = c(1850, NA)) +
  labs(
    subtitle = "D: Cumulative distribution: year completed by dam height",
    x = NULL,
    y = "Percent of all dams with year built data"
  )

p1 + p2 + p3 + p4 +
  plot_annotation(
    title = "The heyday of dam building in the US was 1950-1979",
    subtitle = "It's likely that the dams of undetermined nidHeight built since 1950 are small dams (plots C and D).",
    caption = my_caption
  )

Figure 6.13: Year dams were completed and their height

6.5.2 Primary purposes

Show the code

yearCompletedId_levels <- c("Before 1900", "1900-1909", "1910-1919", "1920-1929", "1930-1939",
                            "1940-1949", "1950-1959", "1960-1969", "1970-1979", "1980-1989",
                            "1990-1999", "Since 2000", "Undetermined")

yearCompletedId_breaks <- c("Before 1900", # "1900-1909", 
                            "1910-1919", # "1920-1929", 
                            "1930-1939", # "1940-1949", 
                            "1950-1959", # "1960-1969", 
                            "1970-1979", # "1980-1989",
                            "1990-1999" # , "Since 2000", "Undetermined"
                            )

year_build_by_primary_purpose <- dams_no_geo |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region)

year_build_by_primary_purpose_big_dams <- dams_no_geo |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId, big_dam) |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  filter(big_dam) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region)


p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #             size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  guides(color = guide_legend(#position = "inside",
                              override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~state_region) +
  theme(plot.title.position = "plot",
        # legend.position.inside = c(I(0.15), I(0.85)),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    subtitle = "A: Dams",
    x = NULL,
    y = NULL
  )

p2 <- year_build_by_primary_purpose_big_dams |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #             size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  scale_color_viridis_d(end = 0.9) +
  guides(color = guide_legend(#position = "inside",
                              override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~state_region) +
  theme(plot.title.position = "plot",
        # legend.position.inside = c(I(0.15), I(0.85)),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    subtitle = "B: Big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "Most dams were built in the South. Most big dams were built in the West",
    subtitle = glue("New dams and their primary purpose, by region",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.14: New dams and their primary purpose, by region

Show the code

year_build_by_primary_purpose <- dams_no_geo |>
  filter(state_region == "South") |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region, state_abb)

year_build_by_primary_purpose_big_dams <- dams_no_geo |>
  filter(big_dam) |>
  filter(state_region == "South") |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region, state_abb)


p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #            size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(#label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  guides(color = guide_legend(# position = "inside",
                              override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb)  +
  theme(plot.title.position = "plot",
        # legend.position.inside = c(I(0.15), I(0.85)),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    subtitle = "A: Dams",
    x = NULL,
    y = NULL
  )

p2 <- year_build_by_primary_purpose_big_dams |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #             size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(breaks = 2 * 0:5,
                     #label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  scale_color_viridis_d(end = 0.9) +
  guides(color = guide_legend(# position = "inside",
                              override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb)  +
  theme(plot.title.position = "plot",
        # legend.position.inside = c(I(0.15), I(0.85)),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    subtitle = "B: Big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("Most of the dams in the South were built in Texas and Oklahoma", 
                 "\nfor flood risk reduction"),
    subtitle = glue("New dams in the South, by primary purpose",
                    "\nIncludes dams for which purpose and year completed data are available"),
    
    caption = my_caption
  )

Figure 6.15: New dams and their purpose (South region)

Show the code

year_build_by_primary_purpose <- dams_no_geo |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region, state_abb) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, -n, min))

dta_highlight <- year_build_by_primary_purpose |>
  slice_max(order_by = n, n = 1,
            by = c(primaryPurposeId))

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_point(data = dta_highlight,
             aes(yearCompletedId, n, ),
             size = 2,
             color = "steelblue",
             show.legend = FALSE) +
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n, label = state_abb),
            nudge_x = 1,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~primaryPurposeId, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    subtitle = "A: By decade",
    x = NULL,
    y = NULL
  )

year_build_by_primary_purpose <- dams_no_geo |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_abb) |>
  complete(yearCompletedId, primaryPurposeId, state_abb,
           fill = list(n = 0)) |>
  mutate(n_cum = cumsum(n),
         idx = row_number(),
         .by = c(primaryPurposeId, state_abb)) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, -n_cum, min))

dta_highlight <- year_build_by_primary_purpose |>
  filter(yearCompletedId != "Undetermined", 
         yearCompletedId == "Since 2000") |>
  slice_max(order_by = n_cum, n = 1,
            by = primaryPurposeId)

p2 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n_cum, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n_cum, label = state_abb),
            nudge_x = 0.5,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~primaryPurposeId, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    subtitle = "B: Cumulative",
    x = NULL,
    y = NULL
  )

p1 / p2 +
  plot_annotation(
    title = "The most common purposes vary by state\ndue to geography and local priorities",
    subtitle = glue("New dams in each state, by primary purpose",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.16: New dams in each state, by primary purpose

Unpacking the “other” category in Figure 6.16 above, Figure 6.17 offers some surprises.

I did not expect New York to have the most hydroelectric dams. Or the most dams for navigation, which appears to be mostly related to the Erie canal “from 1905 to 1918 when the”Barge Canal” was built and over half the original route was abandoned.”²
Missouri has the most dams for debris control and grade stabilization, presumably due to mining.
Florida has the most dams for tailings. I expected another state with a lot of mining to lead here.

Show the code

exclude_purposes <- c("Recreation", "Flood Risk Reduction", "Fire Protection, Stock,\nOr Small Fish Pond",
                      "Irrigation", "Water Supply")

year_build_by_primary_purpose <- dams_no_geo |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  filter(!primaryPurposeId %in% exclude_purposes) |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  count(yearCompletedId, primaryPurposeId, state_abb) |>
  complete(yearCompletedId, primaryPurposeId, state_abb,
           fill = list(n = 0)) |>
  mutate(n_cum = cumsum(n),
         idx = row_number(),
         .by = c(primaryPurposeId, state_abb)) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, -n_cum, min))

dta_highlight <- year_build_by_primary_purpose |>
  filter(yearCompletedId != "Undetermined", 
         yearCompletedId == "Since 2000") |>
  slice_max(order_by = n_cum, n = 1,
            by = primaryPurposeId)

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n_cum, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n_cum, label = state_abb),
            nudge_x = 0.5,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~primaryPurposeId, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    subtitle = "A: Dams",
    x = NULL,
    y = NULL
  )
 
p1 +
  plot_annotation(
    title = "New York and Missouri are top states\nfor two of the 'other' categories",
    subtitle = glue("'Other' cumulative new dams in each state, by primary purpose",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.17: Unpacking the ‘other’ category

Show the code

year_build_by_primary_purpose <- dams_no_geo |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region, state_abb)

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #            size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3),
                              nrow = 1)) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    x = NULL,
    y = NULL
  )

p1 +
  plot_annotation(
    title = "New dams and their primary purpose, by state",
    subtitle = glue("Includes dams for which purpose and year completed data are available", 
                    "\nY axis scale varies"),
    caption = my_caption
  )

Figure 6.18: New dams and their primary purpose, by state

Show the code

year_build_by_primary_purpose_big_dams <- dams_no_geo |>
  filter(big_dam) |>
  # filter(state_region == "South") |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId,
                                  levels = yearCompletedId_levels,
                                  ordered = TRUE)) |>
  mutate(primaryPurposeId = fct_lump_n(primaryPurposeId, n = 5)) |>
  count(yearCompletedId, primaryPurposeId, state_region, state_abb)

p2 <- year_build_by_primary_purpose_big_dams |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = primaryPurposeId, group = primaryPurposeId)) +
  geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
              size = 0.75,
             show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(#breaks = 2 * 0:5,
                     expand = expansion(mult = c(0.01, 0.04))) +
  scale_color_viridis_d(end = 0.90) +
  guides(color = guide_legend(override.aes = list(linewidth = 3),
                              nrow = 1)) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb)  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    x = NULL,
    y = NULL
  )

p2 +
  plot_annotation(
    title = "California has the most big dams",
    subtitle = glue("New big dams and their primary purpose, by state",
                    "Includes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.19: New big dams and their primary purpose, by state

6.5.3 All purposes

Since dams are often built for multiple purposes, and it can be a judgement call to decide which purpose is “primary”, this section considers all documented purposes.

Show the code

purposes_primary <- dams_no_geo |>
  count(primaryPurposeId)

purposes_secondary <- dams_no_geo_all_purposes |>
  count(purpose_id)

purposes_compared <- 
  left_join(
    purposes_primary |>
      rename(purpose_id = primaryPurposeId),
    purposes_secondary,
    by = "purpose_id",
    suffix = c("_primary", "_all")
  ) |>
  filter(purpose_id != "Other or not specified") |>
  mutate(pct_primary_of_all = n_primary / n_all)

p1 <- purposes_compared |>
  mutate(purpose_id = fct_reorder(purpose_id, pct_primary_of_all)) |>
  ggplot() +
  geom_col(aes(n_all, purpose_id),
           alpha = 0.3) +
  geom_col(aes(n_primary, purpose_id),
           alpha = 0.3) +
  geom_text(aes(-10, purpose_id, label = percent(pct_primary_of_all, accuracy = 1)),
            nudge_x = -500,
            hjust = 1) +
  scale_x_continuous(label = label_number(scale_cut = cut_short_scale())) +
  expand_limits(x = -4000) +
  labs(
    subtitle = "A: Count of purposes and\nproportion of primary purposes",
    x = "Count",
    y = NULL
  )

tailings_pct <- purposes_compared |>
  filter(purpose_id == "Tailings") |>
  pull(pct_primary_of_all)

p2 <- dams_no_geo |>
  mutate(n_purposes = str_count(purposeIds, ";") + 1) |>
  ggplot() +
  geom_histogram(aes(n_purposes),
                 binwidth = 1) +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale())) +
  labs(
    subtitle = "B: Number of purposes per dam",
    x = NULL,
    y = "Count"
  )

p2 <- dams_no_geo |>
  mutate(n_purposes = str_count(purposeIds, ";") + 1) |>
  filter(!is.na(n_purposes)) |>
  arrange(desc(n_purposes)) |>
  count(n_purposes) |>
  arrange(desc(n)) |>
  mutate(pct_n = n / sum(n),
         n_cum = cumsum(n),
         n_purpose_label = glue("{n_purposes} purposes"),
         n_purpose_label = fct_rev(n_purpose_label),
         tmp = 0) |>
  ggplot() +
  geom_line(aes(n_cum, n_purpose_label, group = tmp),
            linewidth = 0.5,
            color = "steelblue") +
  geom_point(aes(n_cum, n_purpose_label, group = tmp),
            size = 2,
            color = "steelblue") +
  geom_text(aes(x = 15000, y = n_purpose_label, label = n),
            hjust = 1) +
  geom_text(aes(x = 30000, y = n_purpose_label, label = percent(pct_n, accuracy = 0.1)),
            hjust = 1) +
  scale_x_continuous(label = label_number(scale_cut = cut_short_scale())) +
  expand_limits(x = 0) +
  labs(
    subtitle = "B: Number of purposes per dam",
    x = "Cumulative number of dams",
    y = NULL
  )


p1 + p2 +
  plot_annotation(
    title = "Purposes: primary purpose\nas percent of all purpose mentions",
    subtitle = glue("Ordered by percent of purposes in which this purpose is primary. For example,",
                    " {percent(tailings_pct, accuracy = 1)} of the cases",
                    "\nin which 'Tailings' is mentioned, it's the primary purpose.", 
                    " Or stated another way,",
                    " in {percent(1 - tailings_pct, accuracy = 1)} of the cases,",
                    "\n'Tailings' is a secondary purpose"),
    caption = my_caption
  )

Figure 6.20: Primary and all purposes compared

Show the code

year_build_by_primary_purpose <- dams_no_geo |>
  left_join(dams_no_geo_all_purposes |>
              select(nidId, purpose_id),
            by = join_by(nidId)) |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, purpose_id, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(purpose_id = fct_lump_n(purpose_id, n = 5)) |>
  count(yearCompletedId, purpose_id, state_region, state_abb) |>
  mutate(purpose_id = fct_reorder(purpose_id, -n, min))

dta_highlight <- year_build_by_primary_purpose |>
  slice_max(order_by = n, n = 1,
            by = c(purpose_id))

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_point(data = dta_highlight,
             aes(yearCompletedId, n, ),
             size = 2,
             color = "steelblue",
             show.legend = FALSE) +
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n, label = state_abb),
            nudge_x = 1,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~purpose_id, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    subtitle = "A: By decade",
    x = NULL,
    y = NULL
  )

year_build_by_primary_purpose <- dams_no_geo |>
  left_join(dams_no_geo_all_purposes |>
              select(nidId, purpose_id),
            by = join_by(nidId)) |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, purpose_id, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(purpose_id = fct_lump_n(purpose_id, n = 5)) |>
  count(yearCompletedId, purpose_id, state_abb) |>
  complete(yearCompletedId, purpose_id, state_abb,
           fill = list(n = 0)) |>
  mutate(n_cum = cumsum(n),
         idx = row_number(),
         .by = c(purpose_id, state_abb)) |>
  mutate(purpose_id = fct_reorder(purpose_id, -n_cum, min))

dta_highlight <- year_build_by_primary_purpose |>
  filter(yearCompletedId != "Undetermined", 
         yearCompletedId == "Since 2000") |>
  slice_max(order_by = n_cum, n = 1,
            by = purpose_id)

p2 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n_cum, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n_cum, label = state_abb),
            nudge_x = 0.5,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~purpose_id, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    subtitle = "B: Cumulative",
    x = NULL,
    y = NULL
  )

p1 / p2 +
  plot_annotation(
    title = "The most common purposes vary by state\ndue to geography and local priorities",
    subtitle = glue("New dams in each state, by purpose",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.21: New dams in each state, by purpose

Unpacking the “other” category when considering all purposes in Figure 6.21 above, the surprises in seen in Figure 6.17 have gone away in Figure 6.22:

California overtook New York in number of hydroelectric dams
Texas overtook Missouri in dams for debris control (but not for grade stabilization)

Florida continues to have many more dams for tailings than other states, which remains a surprise..

Show the code

exclude_purposes <- c("Recreation", "Flood Risk Reduction", "Fire Protection, Stock,\nOr Small Fish Pond",
                      "Irrigation", "Water Supply")

year_build_by_primary_purpose <- dams_no_geo |>
  left_join(dams_no_geo_all_purposes |>
              select(nidId, purpose_id),
            by = join_by(nidId)) |>
  filter(yearCompletedId != "Undetermined",
         purpose_id != "Other or not specified",
         purpose_id != "Other") |>
  filter(!purpose_id %in% exclude_purposes) |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, purpose_id, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  count(yearCompletedId, purpose_id, state_abb) |>
  complete(yearCompletedId, purpose_id, state_abb,
           fill = list(n = 0)) |>
  mutate(n_cum = cumsum(n),
         idx = row_number(),
         .by = c(purpose_id, state_abb)) |>
  mutate(purpose_id = fct_reorder(purpose_id, -n_cum, min))

dta_highlight <- year_build_by_primary_purpose |>
  filter(yearCompletedId != "Undetermined", 
         yearCompletedId == "Since 2000") |>
  slice_max(order_by = n_cum, n = 1,
            by = purpose_id)

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n_cum, group = state_abb),
            linewidth = 0.2,
            alpha = 0.2) + 
  geom_text(data = dta_highlight,
            aes(yearCompletedId, n_cum, label = state_abb),
            nudge_x = 0.5,
            color = "steelblue",
            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.1))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3))) +
  expand_limits(y = 0) +
  facet_wrap(~purpose_id, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    # subtitle = "A: Dams",
    x = NULL,
    y = NULL
  )
 
p1 +
  plot_annotation(
    title = "New York and Missouri are top states\nfor two of the 'other' categories",
    subtitle = glue("'Other' cumulative new dams in each state, by purpose",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.22: Unpacking the ‘other’ category in purposes

Show the code

year_build_by_primary_purpose <- dams_no_geo |>
  left_join(dams_no_geo_all_purposes |>
              select(nidId, purpose_id),
            by = join_by(nidId)) |>
  filter(yearCompletedId != "Undetermined",
         primaryPurposeId != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, purpose_id, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId, 
                                  levels = yearCompletedId_levels, 
                                  ordered = TRUE)) |>
  mutate(purpose_id = fct_lump_n(purpose_id, n = 5)) |>
  count(yearCompletedId, purpose_id, state_region, state_abb)

p1 <- year_build_by_primary_purpose |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = purpose_id, group = purpose_id)) +
  # geom_point(aes(yearCompletedId, n, color = primaryPurposeId),
  #            size = 0.75,
  #            show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(#label = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.01, 0.04))) +
  guides(color = guide_legend(override.aes = list(linewidth = 3),
                              nrow = 1)) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb, scales = "free_y")  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)
        ) +
  labs(
    x = NULL,
    y = NULL
  )

p1 +
  plot_annotation(
    title = "New dams and their purposes, by state",
    subtitle = glue("Includes dams for which purpose and year completed data are available", 
                    "\nY axis scale varies"),
    caption = my_caption
  )

Figure 6.23: New dams and their purposes, by state

Many big dams serve multiple purposes. Compare California and Colorado below with Figure 6.19, which counts only primary purposes.

Show the code

year_build_by_primary_purpose_big_dams <- dams_no_geo |>
  filter(big_dam) |>
  left_join(dams_no_geo_all_purposes |>
              select(nidId, purpose_id),
            by = join_by(nidId)) |>
  filter(yearCompletedId != "Undetermined",
         purpose_id != "Other or not specified") |>
  select(nidId, primaryOwnerTypeId, primaryPurposeId, purpose_id, state, state_abb, state_region, nidStorage, yearCompleted, yearCompletedId) |>
  mutate(yearCompletedId = factor(yearCompletedId,
                                  levels = yearCompletedId_levels,
                                  ordered = TRUE)) |>
  mutate(purpose_id = fct_lump_n(purpose_id, n = 5)) |>
  count(yearCompletedId, purpose_id, state_region, state_abb)

p2 <- year_build_by_primary_purpose_big_dams |>
  ggplot() +
  geom_line(aes(yearCompletedId, n, color = purpose_id, group = purpose_id)) +
  geom_point(aes(yearCompletedId, n, color = purpose_id),
              size = 0.75,
             show.legend = FALSE) +
  scale_x_discrete(breaks = yearCompletedId_breaks) +
  scale_y_continuous(#breaks = 2 * 0:5,
                     expand = expansion(mult = c(0.01, 0.04))) +
  scale_color_viridis_d(end = 0.90) +
  guides(color = guide_legend(override.aes = list(linewidth = 3),
                              nrow = 1)) +
  expand_limits(y = 0) +
  facet_wrap(~state_abb)  +
  theme(plot.title.position = "plot",
        legend.position = "top",
        axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(
    x = NULL,
    y = NULL
  )

p2 +
  plot_annotation(
    title = "California has the most big dams",
    subtitle = glue("New big dams and their purposes, by state",
                    "\nIncludes dams for which purpose and year completed data are available"),
    caption = my_caption
  )

Figure 6.24: New big dams and their purposes, by state

6.6 Dam modifications

The data set includes records of modifications for a small portion of the dams:

Year (four digits) when major modifications or rehabilitation of dam or major control structures were completed. Major modifications are defined as a structural, foundation, or mechanical construction activity which significantly restores the project to original condition; changes the project’s operation; capacity or structural characteristics (e.g. spillway or seismic modification); or increases the longevity, stability, or safety of the dam and appurtenant structures.

Category describing the type of modification: Foundation, Hydraulic, Mechanical, Seismic, Structural, Other³

Show the code

dta_for_plot_tmp1 <- dams_no_geo |>
  filter(nidHeight > 0,
         !is.na(yearsModified),
         yearsModified != "-1")

n_dams_with_mod_data <- nrow(dta_for_plot_tmp1)
pct_dams_with_mod_data = n_dams_with_mod_data / nrow(dams_no_geo)

dta_for_plot <- dta_for_plot_tmp1 |>
  select(name, state, nidId, nidHeight, nidHeightId, yearCompleted, yearCompletedId, yearsModified, big_dam) |>
  mutate(yearsModified_orig = yearsModified) |>
  separate_longer_delim(yearsModified, delim = ";") |>
  mutate(modified_reason = str_extract(yearsModified, "[A-Z]$"),
         year_modified = parse_number(yearsModified)) |>
  mutate(years_to_mod = year_modified - yearCompleted,
         .by = c(name, nidId)) |>
  mutate(years_since_last_mod = years_to_mod - lag(years_to_mod, default = NA),
         .by = c(name, nidId)) |>
  mutate(years_since_last_mod = ifelse(is.na(years_since_last_mod),
                                       years_to_mod,
                                       years_since_last_mod)) |>
  filter(years_to_mod > 0) |>
  mutate(mod_idx = row_number(),
         .by = c(name, nidId)) |>
  mutate(mod_idx_label = fct_reorder(glue("mod {mod_idx}"), mod_idx)) |>
  distinct(name, nidId, year_modified, .keep_all = TRUE) |> # remove duplicate rows where mods were made for multiple reasons
  mutate(mod_idx_label = fct_lump_n(mod_idx_label, 
                                    n = 5,
                                    other_level = "mod 6+"))

n_dams_with_good_mod_data <- dta_for_plot |>
  distinct(name, nidId) |>
  nrow()

pct_dams_with_good_mod_data = n_dams_with_good_mod_data / nrow(dams_no_geo)

n_mod_one_only <- dta_for_plot |>
  filter(mod_idx_label == "mod 1") |>
  nrow()

n_mod_after_one <- dta_for_plot |>
  filter(mod_idx_label != "mod 1") |>
  nrow()

pct_mod_data_is_mod1 <- n_mod_one_only / (n_mod_one_only + n_mod_after_one)

my_binwidth <- 10

p1 <- dta_for_plot |>
  ggplot() +
  geom_histogram(aes(years_to_mod),
                 binwidth = my_binwidth) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  labs(
    subtitle = "A: Years from dam completion\nto modification (all mods)",
  )

p2 <- dta_for_plot |>
  filter(mod_idx == 1) |>
  ggplot() +
  geom_histogram(aes(years_to_mod),
                 binwidth = my_binwidth) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  labs(
    subtitle = "B: Years from dam completion\nto first modification",
  )

p1 + p2 +
  plot_annotation(
    title = "Distributions of years from completion to modifications",
    subtitle = glue("Includes {n_dams_with_good_mod_data} ({percent(pct_dams_with_good_mod_data, accuracy = 1)})",
                    " dams with modification data",
                    ". Binwidth: {my_binwidth} years"),
    caption = my_caption
  )

Figure 6.25: Distibution of years from completion to modifications

In Figure 6.25 above we see the distribution of years from completion to any mod (Figure 6.25 panel A) and to the first model (panel B). The distributions have a similar shape–partly because 85% of dams with modification data are for the first modification.

Even so, when we look at the distribution of years from completion to modification (Figure 6.26), the shapes are similar.

Show the code

my_binwidth <- 10

p1 <- dta_for_plot |>
  ggplot() +
  geom_histogram(aes(years_to_mod),
                 binwidth = my_binwidth) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  facet_wrap(~mod_idx_label, scales = "free_y") +
  labs(
    title = "Density plot",
  )

p2 <- dta_for_plot |>
  ggplot() +
  geom_density(aes(years_to_mod, color = mod_idx_label)) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  guides(color = guide_legend(position = "inside")) +
  theme(legend.position.inside = c(0.8, 0.7)) +
  coord_cartesian(x = c(0, 200)) +
  labs(
    title = "Density plot",
    color = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "Similar distributions of years from completion to modification\n(all modifications)",
    subtitle = glue("Includes {n_dams_with_good_mod_data} ({percent(pct_dams_with_good_mod_data, accuracy = 1)})", 
                    " dams with modification data"),
    caption = my_caption
  )

Figure 6.26: Years from completion to last modification

When we look at the years since the last modification (Figure 6.27), the distributions are a similar shape for mods 2+.

Show the code

my_binwidth <- 10

p1 <- dta_for_plot |>
  ggplot() +
  geom_histogram(aes(years_since_last_mod),
                 binwidth = my_binwidth,
                 na.rm = TRUE) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  facet_wrap(~mod_idx_label, scales = "free_y") +
  labs(
    subtitle = glue("Histogram; binwidth: {my_binwidth} years")
  )

p2 <- dta_for_plot |>
  ggplot() +
  geom_density(aes(years_since_last_mod, color = mod_idx_label),
               na.rm = TRUE) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.04))) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  scale_color_viridis_d(end = 0.9) +
  guides(color = guide_legend(position = "inside")) +
  theme(legend.position.inside = c(0.8, 0.7)) +
  coord_cartesian(x = c(0, 150)) +
  labs(
    subtitle = "Density plot",
    color = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("For mods 2+ there is a similar distributions of years\n since last modification"),
    subtitle = glue("Includes {n_dams_with_good_mod_data} ({percent(pct_dams_with_good_mod_data, accuracy = 1)}) dams with modification data"),
    caption = my_caption
  )

Figure 6.27: Distribution of years since last modificaiton

6.6.1 Reasons for modification

Big dams are more frequently modified for structural or seismic reasons (Figure 6.28).

Show the code

dta_for_plot_reasons <- dta_for_plot |>
  filter(!is.na(modified_reason)) |>
  mutate(mod_reason_label = case_match(
    modified_reason,
    "F" ~ "Foundation",
    "H" ~ "Hydraulic",
    "M" ~ "Mechanical",
    "E" ~ "Seismic",
    "S" ~ "Structural",
    "O" ~ "Other",
  )) |>
  count(mod_reason_label, sort = TRUE) |>
  mutate(mod_reason_label = fct_reorder(mod_reason_label, n),
         n_pct = `n` / sum(`n`))

p1 <- dta_for_plot_reasons |>
  ggplot(aes(n, mod_reason_label)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  # scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  labs(
    subtitle = "A: All dams",
    x = NULL,
    y = NULL
  )

dta_for_plot_reasons_big <- dta_for_plot |>
  filter(!is.na(modified_reason),
         big_dam) |>
  mutate(mod_reason_label = case_match(
    modified_reason,
    "F" ~ "Foundation",
    "H" ~ "Hydraulic",
    "M" ~ "Mechanical",
    "E" ~ "Seismic",
    "S" ~ "Structural",
    "O" ~ "Other",
  )) |>
  count(mod_reason_label, sort = TRUE) |>
  mutate(mod_reason_label = fct_reorder(mod_reason_label, n),
         n_pct = `n` / sum(`n`))

p2 <- dta_for_plot_reasons_big |>
  ggplot(aes(n, mod_reason_label)) +
  geom_col() +
  geom_text(aes(x = 0.9 * max(n), label = glue("{n} | {percent(n_pct, accuracy = 1)}")),
            hjust = 1,
            size = 3) +
  scale_x_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  # scale_y_continuous(expand = expansion(mult = c(0.01, 0.04))) +
  labs(
    subtitle = "B: Big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "Reasons for dam modifications",
    caption = my_caption
  )

Figure 6.28: Reasons for dam modification

Is the modification data complete? Unlikely, especially in earlier years and for smaller dams. Beyond that intuition, I cannot assess what proportion of modifications might be missing. Of the 86,351 dams, 6,514 have associated modification dates. 5,436 have valid modification dates.

6.7 State-level information

Below is state-level information, in case you wish to dig in.

Show the code

p1 <- dams_no_geo |>
  count(state, primaryOwnerTypeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryOwnerTypeId),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top")+
  labs(
    subtitle = "A: Primary owner",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p2 <- dams_no_geo |>
  count(state, primaryPurposeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryPurposeId),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top")+
  labs(
    subtitle = "B: Primary purpose",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p3 <- dams_no_geo |>
  count(state, primaryDamTypeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryDamTypeId = fct_reorder(primaryDamTypeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryDamTypeId),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top") +
  labs(
    subtitle = "C: Primary dam type",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p1 + p2 + p3 +
  plot_annotation(
    title = "Basic counts: owner, purpose, dam type",
    caption = my_caption
  )

Figure 6.29: Basic counts: owner, purpose, dam type - by state

Show the code

p1 <- dams_no_geo |>
  count(state, primaryOwnerTypeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryOwnerTypeId),
           position = "fill",
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top")+
  labs(
    subtitle = "A: Primary owner",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p2 <- dams_no_geo |>
  count(state, primaryPurposeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryPurposeId = fct_reorder(primaryPurposeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryPurposeId),
           position = "fill",
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top")+
  labs(
    subtitle = "B: Primary purpose",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p3 <- dams_no_geo |>
  count(state, primaryDamTypeId) |>
  mutate(state = fct_reorder(state, n, sum),
         primaryDamTypeId = fct_reorder(primaryDamTypeId, n, sum)) |>
  ggplot() +
  geom_col(aes(x = n, y = state, fill = primaryDamTypeId),
           position = "fill",
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  guides(fill = guide_legend(reverse = TRUE,
                             ncol = 2)) +
  theme(legend.position = "top")+
  labs(
    subtitle = "C: Primary dam type",
    x = NULL,
    y = NULL,
    fill = NULL
  )

p1 + p2 + p3 +
  plot_annotation(
    title = "Basic counts: owner, purpose, dam type (proportion of each category)",
    caption = my_caption
  )

Figure 6.30: Basic counts: owner, purpose, dam type (proportion of each category) - by state

Show the code

state_owner <- dams |>
  st_drop_geometry() |>
  count(primaryOwnerTypeId, state) |>
  complete(primaryOwnerTypeId, state, fill = list(n = 0)) |>
  mutate(primaryOwnerTypeId = fct_reorder(primaryOwnerTypeId, -n, sum),
         state = fct_reorder(state, n, sum))

p1 <- state_owner |>
  ggplot(aes(x = primaryOwnerTypeId, y = state)) +
  geom_tile(aes(fill = n),
            alpha = 0.98) +
  geom_text(aes(label = n),
            color = "white",
             size = 3) +
  theme(plot.title.position = "plot",
        axis.text.x = element_text(angle = 60, hjust = 0)
        ) +
  scale_x_discrete(position = "top") +
  scale_fill_continuous(transform = scales::transform_log1p(),
                        breaks = c(0, 10, 100, 1000, 10000)) +
  guides(fill = "none") +
  labs(
    subtitle = "A: Primary owner type - count",
    x = NULL,
    y = NULL
  )

state_purpose <- dams |>
  st_drop_geometry() |>
  count(primaryPurposeId, state) |>
  complete(primaryPurposeId, state, fill = list(n = 0)) |>
  mutate(primaryPurposeId = fct_reorder(primaryPurposeId, -n, sum),
         state = fct_reorder(state, n, sum))

p2 <- state_purpose |>
  ggplot(aes(x = primaryPurposeId, y = state)) +
  geom_tile(aes(fill = n),
            alpha = 0.98) +
  geom_text(aes(label = n),
            color = "white",
             size = 3) +
  theme(plot.title.position = "plot",
        axis.text.x = element_text(angle = 60, hjust = 0)
        ) +
  scale_x_discrete(position = "top") +
  scale_fill_continuous(transform = scales::transform_log1p(),
                        breaks = c(0, 10, 100, 1000, 10000),
                        ) + # transform_sqrt()) +
  labs(
    subtitle = "B: Primary purpose - count",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = "State by primary owner and primary purpose",
    caption = my_caption
  )

Figure 6.31: State by primary owner type and purpose - counts

Show the code

p1 <- dams_ratios_per_state |>
  ggplot() +
  geom_col(aes(x = n_dams, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "A: Number of dams",
    x = NULL,
    y = NULL
  )

p2 <- dams_ratios_per_state_big |>
  ggplot() +
  geom_col(aes(x = n_dams, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "B: Number of big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("Number of dams"),
    # subtitle = glue("Includes {n_dams_with_good_mod_data} ({percent(pct_dams_with_good_mod_data, accuracy = 1)}) dams with modification data"),
    caption = my_caption
  )

Show the code

p1 <- dams_ratios_per_state |>
  mutate(state = fct_reorder(state, pop_per_dam)) |>
  ggplot() +
  geom_col(aes(x = pop_per_dam, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "A: Population per dam",
    x = NULL,
    y = NULL
  )

p2 <- dams_ratios_per_state_big |>
  mutate(state = fct_reorder(state, pop_per_dam)) |>
  ggplot() +
  geom_col(aes(x = pop_per_dam, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "B: Population per big dam",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("Population per dam"),
    caption = my_caption
  )

Show the code

p1 <- dams_ratios_per_state |>
  mutate(state = fct_reorder(state, sqmi_per_dam)) |>
  ggplot() +
  geom_col(aes(x = sqmi_per_dam, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "A: All dams",
    x = NULL,
    y = NULL
  )

p2 <- dams_ratios_per_state_big |>
  mutate(state = fct_reorder(state, sqmi_per_dam)) |>
  ggplot() +
  geom_col(aes(x = sqmi_per_dam, y = state),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  labs(
    subtitle = "B: Big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("State sqmi per dam"),
    caption = my_caption
  )

Show the code

dta_for_plot <- dams_with_pop |>
  reframe(n_dams = n(),
         median_surface_area_to_nidStorage = median(surfaceArea / nidStorage, na.rm = TRUE),
         avg_surface_area_to_nidStorage = mean(surfaceArea / nidStorage, na.rm = TRUE),
         .by = "state"
  ) |>
  mutate(state_label = glue("{state} ({n_dams})"),
         state_label = fct_reorder(state_label, median_surface_area_to_nidStorage))

n_dams_total = sum(dta_for_plot$n_dams)

p1 <- dta_for_plot |>
  ggplot() +
  geom_col(aes(x = median_surface_area_to_nidStorage, 
               width = pmax(15 * n_dams / n_dams_total, 0.02), 
               y = state_label),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  theme(axis.ticks.y = element_blank()) +
  labs(
    subtitle = "A: All dams",
    x = NULL,
    y = NULL
  )

dta_for_plot_big <- dams_with_pop |>
  filter(big_dam) |>
  reframe(n_dams = n(),
         median_surface_area_to_nidStorage = median(surfaceArea / nidStorage, na.rm = TRUE),
         avg_surface_area_to_nidStorage = mean(surfaceArea / nidStorage, na.rm = TRUE),
         .by = "state"
  ) |>
  mutate(state_label = glue("{state} ({n_dams})"),
         state_label = fct_reorder(state_label, median_surface_area_to_nidStorage))

n_dams_total_big = sum(dta_for_plot_big$n_dams)

p2 <- dta_for_plot_big |>
  ggplot() +
  geom_col(aes(x = median_surface_area_to_nidStorage,
               width = pmax(8 * n_dams / n_dams_total_big, 0.02),
               y = state_label),
           alpha = 0.7) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.02))) +
  scale_fill_viridis_d(end =  0.9,
                       direction = -1) +
  theme(axis.ticks.y = element_blank()) +
  labs(
    subtitle = "B: Big dams",
    x = NULL,
    y = NULL
  )

p1 + p2 +
  plot_annotation(
    title = glue("Median dam `surfaceArea`::`nidStorage` by state"),
    subtitle = "Vertical height reflects the relative number of dams in the state",
    caption = my_caption
  )

Figure 6.34: Median `surfaceArea`::`nidStorage`

I’m surprised Florida is so low in the ranking in panel A, given it’s such a flat state. Also that it has 71 big dams (panel B).

Looking deeper #1: all the big dams are privately owned by companies mining phosphorus, potash, etc. and nearly all have “tailings” as their primary purpose. (They have smaller dams in Florida too.)

Show the code

dams_no_geo |>
  filter(big_dam,
         state_abb == "FL") |>
  select(nidId, name, ownerNames, primaryPurposeId, nidHeight, nidStorage) |>
  reframe(
    n_dams = n(),
    `nidStorage median` = median(nidStorage),
    `nidStorage smallest` = min(nidStorage),
    `nidStorage largest` = max(nidStorage),
    `nidHeight median` = median(nidHeight),
    `nidHeight smallest` = min(nidHeight),
    `nidHeight largest` = max(nidHeight),
    .by = ownerNames
  ) |>
  arrange(desc(`nidStorage median`), `nidHeight median`) |>
  gt() |>
  tab_header(md("**Big dams in Florida are owned by mining companies**")) |>
  fmt_number(columns = is.numeric,
             decimals = 0)

Big dams in Florida are owned by mining companies
ownerNames	n_dams	nidStorage median	nidStorage smallest	nidStorage largest	nidHeight median	nidHeight smallest	nidHeight largest
PCS PHOSPHATE	3	52,500	52,500	52,500	105	105	105
U.S. AGRI - CHEM	2	1,768	1,178	2,358	142	135	149
THE MOSAIC COMPANY	60	1,152	128	16,000	105	103	145
HRK HOLDINGS, LLC	6	960	896	1,920	105	105	105

Looking deeper #2: Regarding dams with the highest nidStorage_median owned by PCS PHOSPHATE (a subsidiary of Nutrien):

It’s suspicious that median, smallest, and largest are all the same size – are the entries duplicates, or are there data errors? It’s very unlikely that three dams would manage exactly the same amount of water or be exactly the same height.
Each dam has it’s own name and nidId, and long/lat coordinates are different.
I verified the dams on Google maps: there are three separate holding ponds very close to each other, located in North Florida about halfway between Tallahassee and Jacksonville.
Thus is seems some of the data fields were duplicated, but not all:

Show the code

dams_no_geo |>
  filter(big_dam,
         state_abb == "FL",
         ownerNames == "PCS PHOSPHATE") |>
  select(nidId, name, ownerNames, `primary PurposeId` = primaryPurposeId, nidHeight, nidStorage, `surface Area` = surfaceArea, longitude, latitude) |>
  arrange(nidId) |>
  gt() |>
  tab_header(md("**PCS PHOSPHATE's big dams seem to have some duplicated data in the NID**")) |>
  fmt_number(columns = starts_with("nid"),
             decimals = 0)

PCS PHOSPHATE’s big dams seem to have some duplicated data in the NID
nidId	name	ownerNames	primary PurposeId	nidHeight	nidStorage	surface Area	longitude	latitude
FL10002	Ctc Gypsum	PCS PHOSPHATE	Other or not specified	105	52,500	500	-82.79000	30.42997
FL10004	Sc Gypsum	PCS PHOSPHATE	Other or not specified	105	52,500	500	-82.87461	30.44806
FL10009	Do Gypsum	PCS PHOSPHATE	Tailings	105	52,500	500	-82.77978	30.43331

Or in some cases, dams provide surface impoundment of mine tailings or coal ash. See for example Fact Sheet: Coal Ash. EPA 530-F-23-004. June 2023 at https://www.epa.gov/system/files/documents/2023-06/CCR_Fact_Sheet_June_2023.pdf ↩︎
https://en.wikipedia.org/wiki/Erie_Canal ↩︎
National Inventory of Dams Data Dictionary https://usace-cwbi-prod-il2-nld2-docs.s3-us-gov-west-1.amazonaws.com/ec51e2ba-daff-4dbe-95eb-af13b91066ba/NID%20Data%20Dictionary%202021-12-14.pdf ↩︎