2  Inspection failures

Inspection failure rates in 2022 for (model_year, brand, model) vary from 0% to 67%:

Show the code
dta_working_set |>
  filter(failure_rate == min(failure_rate) | failure_rate == max(failure_rate)) |>
  select(model_year, brand, model, inspection_count, failure_rate, failure_reason_1) |>
  gt()
Table 2.1: Minimum and maximum falure rates considering all (model_year, brand, model)
model_year brand model inspection_count failure_rate failure_reason_1
2009 Dodge CALIBER 171 0.6725 Rear axle
2017 Suzuki SWIFT 172 0.0000 Not provided


Here I start using vehicle_age instead of model_year, because it’s a more natural way of thinking about the association with failure rate.

While there are strong correlations among model year, km driven (average or median), and failure rate (Figure 2.1 and Table 2.2), keep in mind that differences among groups in model_year (also expressed as vehicle_age) and median_km_driven mean that summary statistics and rankings may be misleading. See Chapter 3 Modeling failure rate.

Show the code
data_for_plot <- dta_working_set

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  pivot_longer(cols = inspection_count:median_km_driven,
               names_to = "variable",
               values_to = "value") |>
  mutate(variable = fct_relevel(variable, c("average_km_driven", "median_km_driven",
                                           "failure_rate", "inspection_count"))) |>
  ggplot() +
  geom_jitter(aes(vehicle_age, value),
             size = 0.25, alpha = 0.25, na.rm = TRUE) +
  geom_smooth(aes(vehicle_age, value),
              method = "loess", formula = 'y ~ x',
              na.rm = TRUE
              ) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.05))) +
  expand_limits(y = 0) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Four views (all brand summary)",
    subtitle = glue("Y axis varies; model years {model_year_range}"),
    x = "Vehicle age",
    y = NULL,
    caption = my_caption
  )

Figure 2.1: Four views of inspection failure data (all brand summary)


Show the code
dta_working_set |>
  select(vehicle_age, median_km_driven, average_km_driven, failure_rate) |>
  cor() |>
  as.data.frame() |>
  rownames_to_column(var = "variables") |>
  gt() |>
  tab_header(md("**Correlations**")) |>
  fmt_number(decimals = 3) |>
  tab_source_note(md("*cor(vehicle_age, median_km_driven, average_km_driven, failure_rate)*"))
Table 2.2: High correlations
Correlations
variables vehicle_age median_km_driven average_km_driven failure_rate
vehicle_age 1.000 0.708 0.700 0.747
median_km_driven 0.708 1.000 0.996 0.595
average_km_driven 0.700 0.996 1.000 0.591
failure_rate 0.747 0.595 0.591 1.000
cor(vehicle_age, median_km_driven, average_km_driven, failure_rate)


This is generally true at the brand level as well (Figure 2.2).

Show the code
data_for_plot <- all_models_yearly |>
  mutate(model_year = as.numeric(model_year),
         vehicle_age = 2022 - model_year) |>
  filter(brand != "All brands") |>
  complete(model_year, brand,
           fill = list(inspection_count = 0, model = "All models")
  )

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  pivot_longer(cols = c(inspection_count, failure_rate, average_km_driven, median_km_driven),
               names_to = "variable",
               values_to = "value") |>
  mutate(variable = fct_relevel(variable, c("average_km_driven", "median_km_driven",
                                           "failure_rate", "inspection_count"))) |>
  ggplot() +
  geom_line(aes(vehicle_age, value, group = brand),
            linewidth = 0.25, alpha = 0.4,
            na.rm = TRUE, show.legend = FALSE) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.05))) +
  expand_limits(y = 0) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Brands generally follow summary trends",
    subtitle = glue("Y axis varies; model years {model_year_range}"),
    x = "Vehicle age",
    y = NULL,
    caption = my_caption
  )

Figure 2.2: Four views of inspection failure data (brands)


The most linear relationship is failure rate by vehicle age (Figure 2.3 panel B), which helps to explain the difference between the corresponding simple linear models (Table 3.2).

Show the code
data_for_plot <- dta_working_set |>
  mutate(median_km_driven_k = median_km_driven / 1000)

p1 <- data_for_plot |>
  ggplot(aes(median_km_driven, failure_rate)) +
  geom_point(aes(color = model_year),
            na.rm = TRUE, alpha = 0.4) +
  geom_smooth(data = data_for_plot |>
                filter(median_km_driven < 300000),
              aes(median_km_driven, failure_rate),
              method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1.75, color = "white") +
  geom_smooth(data = data_for_plot |>
                filter(median_km_driven < 300000),
              aes(median_km_driven, failure_rate),
              method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1, color = "blue") +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  scale_color_viridis_c(direction = 1) +
  labs(
    tag = "A"
  )

p2 <- data_for_plot |>
  ggplot(aes(vehicle_age, failure_rate)) +
  geom_jitter(aes(color = median_km_driven_k),
            na.rm = TRUE, alpha = 0.4) +
  geom_smooth(method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1.75, color = "white") +
  geom_smooth(method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1, color = "blue") +
  scale_color_viridis_c(direction = 1)  +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  expand_limits(y = 0) +
  labs(
    tag = "B"
  )

p3 <- data_for_plot |>
  ggplot(aes(vehicle_age, median_km_driven)) +
  geom_jitter(aes(color = failure_rate),
            na.rm = TRUE, alpha = 0.4) +
  geom_smooth(method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1.75, color = "white") +
  geom_smooth(method = 'loess', formula = 'y ~ x',
              se = FALSE, linewidth = 1, color = "blue") +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0, 0.05))) +
  scale_color_viridis_c(direction = 1) +
  expand_limits(y = 0) +
  labs(
    tag = "C"
  )

p1 + p2 + p3 +
  plot_annotation(
    title = "Vehicle inspection failures",
    caption = my_caption
  )

Figure 2.3: Inspection failure rates by distance driven and vehicle age


Linear regression plots by brand reveal differences in the failure rates of the brands:

Show the code
data_for_plot <- dta_working_set

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  ggplot() +
  geom_point(aes(median_km_driven, failure_rate, group = brand, color = vehicle_age, size = inspection_count),
             na.rm = TRUE, size = 0.5, alpha = 0.4,
             show.legend = TRUE) +
  geom_smooth(aes(median_km_driven, failure_rate, group = brand),
              method = "lm", formula = 'y ~ x', se = FALSE, linewidth = 0.5,
              show.legend = FALSE) +
  scale_x_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_color_viridis_c(end = 0.85,
                        breaks = 3 * 1:5) + #c(4, 8, 12, 16)) +
  scale_size_continuous(range = c(1, 10)) +
  expand_limits(y = 0) +
  facet_wrap(~ brand) +
  theme(legend.position = "bottom") +
  labs(
    title = "Vehicle inspection failure percentage by median km driven",
    subtitle = glue("All brands and models; model years {model_year_range}"),
    x = "Median km driven",
    y = "Failure rate",
    caption = my_caption
  )

Figure 2.4: Inspection failures by distance driven faceted by brand


Considering vehicles of each model year as a group, what was the failure rate for each brand? The same brands are not in the top or bottom five each year although some brands do appear often (Figure 2.5).

Show the code
data_for_plot <- dta_working_set

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  mutate(brand = reorder_within(brand, -failure_rate, model_year)) |>
  ggplot(aes(failure_rate, brand, group = brand, color = brand)) +
  geom_boxplot(varwidth = TRUE,
             na.rm = TRUE,
             show.legend = FALSE) +
  scale_y_reordered() +
  expand_limits(y = 0) +
  facet_wrap(~ model_year,scales = "free_y") +
  theme(legend.position = "bottom") +
  labs(
    title = "Ranked brands by failure rate for each model year",
    subtitle = glue("All brands and models; model years {model_year_range}"),
    x = "Failure rate",
    y = NULL,
    caption = my_caption
  )

Figure 2.5: Ranked brands by failure rate boxplot faceted by model year


Considering vehicles of each brand over all years, what are the brands’ failure rate trends (Figure 2.6)?

Show the code
data_for_plot <- dta_working_set

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  ggplot(aes(vehicle_age, failure_rate, color = model_year, group = model_year)) +
  geom_boxplot(varwidth = TRUE,
               outlier.size = 0.5,
             na.rm = TRUE,
             show.legend = FALSE) +
  scale_color_viridis_c(end = 0.85) +
  facet_wrap(~ brand) + 
  theme(legend.position = "bottom") +
  labs(
    title = "Ranked brands by failure rate for each model year",
    subtitle = glue("All brands and models with at least 3 model years; model years {model_year_range}"),
    x = "Vehicle age",
    y = "Failure rate",
    caption = my_caption
  )

Figure 2.6: Failure rate boxplot by model year for each brand


Failure rates vary to a surprising amount within most brands when plotted against vehicle age (Figure 2.7).

Show the code
data_for_plot <- dta_working_set

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  ggplot(aes(vehicle_age, failure_rate, color = model, group = model)) +
  geom_point(size = 0.5, alpha = 0.4,
             show.legend = FALSE) +
  geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE, linewidth = 0.5,
              show.legend = FALSE) +
  facet_wrap( ~ brand) +
  theme(legend.position = "bottom") +
  labs(
    title = "Vehicle inspection failure percentage by vehicle age for each model",
    subtitle = glue("All brands and models with at least 3 model years; model years {model_year_range}"),
    x = "Vehicle age",
    y = "Failure rate",
    caption = my_caption
  )

Figure 2.7: Failure rates for each model by vehicle age faceted by brand


The differences in Figure 2.7 within brands are due in part to some models being driven more than others. The differences within brands (while still noticeable) are not as large when plotted against distance driven (Figure 2.8).

Show the code
data_for_plot <- dta_working_set  |>
  mutate(median_km_driven_k = median_km_driven / 1000)

my_breaks <- seq(from = 0, to = max(data_for_plot$median_km_driven_k), by = 100)

model_year_range <- glue("{min(data_for_plot$model_year)} - {max(data_for_plot$model_year)}")

data_for_plot |>
  ggplot(aes(median_km_driven_k, failure_rate, color = model, group = model)) +
  geom_point(size = 0.5, alpha = 0.4,
             show.legend = FALSE) +
  geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE, linewidth = 0.5,
              show.legend = FALSE) +
  scale_x_continuous(breaks = my_breaks) +
  facet_wrap( ~ brand) +
  theme(legend.position = "bottom") +
  labs(
    title = "Vehicle inspection failure percentage by model_year by median km driven",
    subtitle = glue("All brands and models with at least 3 model years; model years {model_year_range}"),
    x = "Median km driven (K)",
    y = "Failure rate",
    caption = my_caption
  )

Figure 2.8: Failure rates for each model by distance driven faceted by brand