4  Failure reasons

The data set includes the most common reasons that inspections failed for most (model_year, brand, model). Some (model_year, brand, model) include the second-most and third-most common reasons. To be included as a first, second or third most common reason, it must have been recorded for at least 10% of the vehicles in that (model_year, brand, model). I use vehicle_age in place of model_year below.

4.1 Most common reasons

Show the code
dta_reason_1_overall <- dta_working_set |>
  count(failure_reason_1, sort = TRUE, name = "n_failures") |>
  mutate(rank = row_number()) |>
  left_join(reasons_df,
            join_by(failure_reason_1 == reason)) |>
  mutate(reason = coalesce(reason_short, failure_reason_1)) |>
  select(reason, n_failures, rank)

dta_reason_1_overall_yearly <- dta_working_set |>
  mutate(vehicle_age = 2022 - model_year) |>
  summarize(n_failures = n(),
            .by = c(vehicle_age, failure_reason_1)
            ) |>
  arrange(vehicle_age, desc(n_failures)) |>
  mutate(rank = row_number(),
         .by = vehicle_age
         ) |>
  left_join(reasons_df,
            join_by(failure_reason_1 == reason)) |>
  mutate(reason = coalesce(reason_short, failure_reason_1)) |>
  select(vehicle_age, reason, n_failures, rank)

dta_reason_1_overall_weighted <- dta_working_set |>
  mutate(n_failures = round(inspection_count * failure_rate)) |>
  count(failure_reason_1, sort = TRUE, wt = n_failures, name = "n_failures") |>
  mutate(rank = row_number()) |>
  left_join(reasons_df,
            join_by(failure_reason_1 == reason)) |>
  mutate(reason = coalesce(reason_short, failure_reason_1)) |>
  select(reason, n_failures, rank)
  
dta_reason_1_overall_yearly_weighted <- dta_working_set |>
  mutate(vehicle_age = 2022 - model_year,
         n_failures = round(inspection_count * failure_rate)) |>
  count(vehicle_age, failure_reason_1, wt = n_failures, name = "n_failures") |>
  arrange(vehicle_age, desc(n_failures)) |>
  mutate(rank = row_number(),
         .by = vehicle_age
         ) |>
  left_join(reasons_df,
            join_by(failure_reason_1 == reason)) |>
  mutate(reason = coalesce(reason_short, failure_reason_1)) |>
  select(vehicle_age, reason, n_failures, rank)

dta_reasons_all_weighted <- dta_working_set  |>
  pivot_longer(cols = starts_with("failure_reason"),
               names_to = "failure_number",
               values_to = "reason") |>
  filter(reason != "All model years", # found in failure_reason_2; not helpful
         failure_number == "failure_reason_1" |
           (failure_number == "failure_reason_2" & reason != "Not provided") |
           (failure_number == "failure_reason_3" & reason != "Not provided")
  ) |>
  left_join(reasons_df,
            join_by(reason == reason)) |>
  mutate(reason = coalesce(reason_short, reason)) |>
  mutate(failure_rate_weight = case_match(
    failure_number,
    "failure_reason_1" ~ 1.0,
    "failure_reason_2" ~ 0.35,
    "failure_reason_3" ~ 0.15),
    n_failures = round(failure_rate_weight * inspection_count * failure_rate)
  ) |>
  #count(reason, sort = TRUE, wt = n_failures) |>
  summarize(n_failures = sum(n_failures),
            .by = reason) |>
  arrange(desc(n_failures)) |>
  mutate(rank = row_number())

dta_reasons_all_yearly_weighted <- dta_working_set |>
  pivot_longer(cols = starts_with("failure_reason"),
               names_to = "failure_number",
               values_to = "reason") |>
  filter(reason != "All model years", # found in failure_reason_2; not helpful
         failure_number == "failure_reason_1" |
           (failure_number == "failure_reason_2" & reason != "Not provided") |
           (failure_number == "failure_reason_3" & reason != "Not provided")
  ) |>
  left_join(reasons_df,
            join_by(reason == reason)) |>
  mutate(reason = coalesce(reason_short, reason)) |>
  mutate(vehicle_age = 2022 - model_year,
         failure_rate_weight = case_match(
           failure_number,
           "failure_reason_1" ~ 1.0,
           "failure_reason_2" ~ 0.35,
           "failure_reason_3" ~ 0.15),
         n_failures = round(failure_rate_weight * inspection_count * failure_rate)
  ) |>
  count(vehicle_age, reason, sort = TRUE, wt = n_failures, name = "n_failures") |>
  mutate(rank = row_number(),
         .by = vehicle_age
  )

subset_by_brand_dta_most_common_reasons_yearly_weighted <- function(tbl = dta_working_set,
                                                                    mybrand = "") {
  
  #test
  # tbl <- dta_working_set
  # mybrand <- "VW"
  
  if(length(mybrand) > 1 || mybrand != "") {
    mytbl <- tbl |>
      filter(brand %in% mybrand)
  } else {
    mytbl <- tbl
  }
  
  if(nrow(mytbl) == 0 ) {
    stop("subset_by_brand_dta_reasons_all_yearly_weighted(): mybrand string(s) not found in dataframe")
  }
  
  mytbl |>
    # pivot_longer(cols = starts_with("failure_reason"),
    #              names_to = "failure_number",
    #              values_to = "reason") |>
    # filter(reason != "All model years", # found in failure_reason_2; not helpful
    #      failure_number == "failure_reason_1" |
    #        (failure_number == "failure_reason_2" & reason != "Not provided") |
    #        (failure_number == "failure_reason_3" & reason != "Not provided")
    rename(reason = failure_reason_1) |>
    left_join(reasons_df,
              join_by(reason == reason)) |>
    mutate(reason = coalesce(reason_short, reason)) |>
    mutate(vehicle_age = 2022 - model_year,
           # failure_rate_weight = case_match(
           #   failure_number,
           #   "failure_reason_1" ~ 1.0,
           #   "failure_reason_2" ~ 0.35,
           #   "failure_reason_3" ~ 0.15),
           n_failures = round(inspection_count * failure_rate)
    ) |>
    count(vehicle_age, reason, sort = TRUE, wt = n_failures, name = "n_failures") |>
    mutate(rank = row_number(),
           .by = vehicle_age
    )
  
}

subset_by_brand_dta_reasons_all_yearly_weighted <- function(tbl = dta_working_set,
                                                   mybrand = "") {
  
  #test
  # tbl <- dta_working_set
  # mybrand <- "VW"
  
  if(length(mybrand) > 1 || mybrand != "") {
    mytbl <- tbl |>
      filter(brand %in% mybrand)
  } else {
    mytbl <- tbl
  }
  
  if(nrow(mytbl) == 0 ) {
    stop("subset_by_brand_dta_reasons_all_yearly_weighted(): mybrand string(s) not found in dataframe")
  }
  
  mytbl |>
    pivot_longer(cols = starts_with("failure_reason"),
                 names_to = "failure_number",
                 values_to = "reason") |>
    filter(reason != "All model years", # found in failure_reason_2; not helpful
         failure_number == "failure_reason_1" |
           (failure_number == "failure_reason_2" & reason != "Not provided") |
           (failure_number == "failure_reason_3" & reason != "Not provided")
  ) |>
    left_join(reasons_df,
              join_by(reason == reason)) |>
    mutate(reason = coalesce(reason_short, reason)) |>
    mutate(vehicle_age = 2022 - model_year,
           failure_rate_weight = case_match(
             failure_number,
             "failure_reason_1" ~ 1.0,
             "failure_reason_2" ~ 0.35,
             "failure_reason_3" ~ 0.15),
           n_failures = round(failure_rate_weight * inspection_count * failure_rate)
    ) |>
    count(vehicle_age, reason, sort = TRUE, wt = n_failures, name = "n_failures") |>
    mutate(rank = row_number(),
           .by = vehicle_age
    )
  
}

In the tables below (Table 4.1) I rank reasons by the number of inspection failures reported having this reason in two ways: (a) considering only the most common reason for each (model_year, brand, model); and (b) also including second and third most common reasons when they are available. I estimate the rate of second and third-most common reasons at 35% and 15% respectively. Since the order of the reasons are quite similar, I use (a) “most common reasons” in the plots in this chapter, since it’s easier to understand.

Show the code
##| column: page

dta_reason_1_overall_weighted |>
  gt()
dta_reasons_all_weighted |>
  gt()

Table 4.1: Reasons for inspection failure: all ages, brands and models

(a) Most common reasons
reason n_failures rank
Suspension 68320 1
Front axle 66715 2
P-brake test 18165 3
Rear axle 16051 4
Steering 14219 5
Brakes 12097 6
Chassis etc 6423 7
Not provided 5017 8
Brake test 4154 9
Factual docs 2990 10
OBD 1305 11
Petrol Exhaust 1045 12
Tyres and rims 710 13
Mfg plate 39 14
(b) All reasons
reason n_failures rank
Suspension 86358 1
Front axle 82700 2
P-brake test 33499 3
Brakes 29292 4
Steering 23045 5
Rear axle 21526 6
Brake test 11960 7
Chassis etc 8201 8
Not provided 5017 9
Factual docs 3934 10
OBD 3521 11
Tyres and rims 2528 12
Petrol Exhaust 2297 13
Diesel Exhaust 477 14
Mfg plate 159 15
Bodywork 28 16
Headlamp 26 17
Safety equip 20 18
Registr markings 6 19
Stability control 6 20


4.2 Most common reasons by vehicle age

As vehicles age and are driven farther, there are more inspection failures, and the number of most common reasons for failure increases.

Problems with the front axle and suspension are the most common over most years (Figure 4.1). As vehicles age there are more problems, so while from about age 10 there are about 9K vehicles failing with these two reason as the most common (panel A), the percentage of failures of these two reasons goes down panel B).

Show the code
data_for_plot <- dta_reason_1_overall_yearly_weighted |>
  mutate(pct_of_failures = n_failures / sum(n_failures),
         .by = vehicle_age)

plot_labels_for_p2 <- data_for_plot |>
  filter(vehicle_age == max(vehicle_age))

max_y_count <- max(data_for_plot$n_failures)
max_y_pct <- max(data_for_plot$pct_of_failures)

p1 <- data_for_plot|>
  ggplot(aes(vehicle_age, n_failures, color = reason)) +
  geom_line(show.legend = FALSE) +
  scale_x_continuous(breaks = c(4, 8, 12),
                     expand = expansion(mult = c(0.02, 0.05))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.05))) +
  labs(
    subtitle = "Count of vehicle year failures",
    x = "Vehicle age",
    y = NULL
  )

p2 <- data_for_plot|>
  ggplot(aes(vehicle_age, pct_of_failures, color = reason)) +
  geom_line(show.legend = FALSE) +
  geom_text(data = plot_labels_for_p2,
            aes(vehicle_age + 0.5, pct_of_failures, label = reason),
            hjust = 0, check_overlap = TRUE, show.legend = FALSE
  ) +
  scale_x_continuous(breaks = c(4, 8, 12),
                     expand = expansion(mult = c(0, 0.05))) +
  scale_y_continuous(labels = label_percent(),
                     expand = expansion(mult = c(0.02, 0.05))) +
  coord_cartesian(xlim = c(4, 18)) +
  labs(
    subtitle = "Percent of vehicle year failures each year",
    x = "Vehicle age",
    y = NULL
  )

p3 <- data_for_plot|>
  ggplot(aes(vehicle_age, n_failures, color = reason)) +
  geom_smooth(show.legend = FALSE, size = 0.75,
              method = 'loess', formula = 'y ~ x', se = FALSE, span = 0.99) +
  scale_x_continuous(breaks = c(4, 8, 12),
                     expand = expansion(mult = c(0.02, 0.05))) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale()),
                     expand = expansion(mult = c(0.02, 0.05))) +
  coord_cartesian(ylim = c(0, max_y_count)) +
  labs(
    subtitle = "Smoothed count of vehicle year failures",
    x = "Vehicle age",
    y = NULL
  )
  
p4 <- data_for_plot|>
  ggplot(aes(vehicle_age, pct_of_failures, color = reason)) +
  geom_smooth(show.legend = FALSE, size = 0.75,
              method = 'loess', formula = 'y ~ x', se = FALSE, span = 0.99) +
  geom_text(data = plot_labels_for_p2,
            aes(vehicle_age + 0.5, pct_of_failures, label = reason),
            hjust = 0, check_overlap = TRUE, show.legend = FALSE
  ) +
  scale_x_continuous(breaks = c(4, 8, 12),
                     expand = expansion(mult = c(0, 0.05))) +
  scale_y_continuous(labels = label_percent(),
                     expand = expansion(mult = c(0.02, 0.05))) +
  coord_cartesian(xlim = c(4, 18),
                  ylim = c(0, max_y_pct)) +
  labs(
    subtitle = "Smoothed percent of vehicle year failures each year",
    x = "Vehicle age",
    y = NULL
  )

p1 + p2 + p3 + p4 +
  plot_annotation(
    title = "Most common failure count and rate by vehicle age",
    caption = my_caption
  )

Figure 4.1: Most common failure count and rate by vehicle age (all brands)


4.3 Ranked most common reasons by vehicle age

The changes are easier to see in a simple ranking (Figure 4.2).

4.3.1 All brands

Show the code
plot_most_common_reasons <- function(tbl,
                                mytitle = "Most common failure reasons",
                                mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."
                                ) {

  # test 
  # tbl <- dta_reasons_all_yearly_weighted 
  # mytitle = "Failure reasons",
  # mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."

tbl |>
  ggplot(aes(vehicle_age, rank, color = reason)) + #failure_reason_1),
  geom_bump(show.legend = FALSE) +
  geom_point(show.legend = FALSE) +
  geom_label(aes(label = reason), #failure_reason_1),
             show.legend = FALSE,
             size = 5) +
  scale_x_continuous(breaks = 1:15,
                     position = "top") +
  scale_y_reverse(breaks = 1:15) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_text(size = 18),
        axis.title.x = element_text(size = 18, hjust = 0),
        axis.title.y = element_text(size = 18, hjust = 1)) +
  labs(
    title = mytitle,
    subtitle = mysubtitle,
    x = "Vehicle age",
    y = "Rank",
    caption = my_caption
  )
}

plot_most_common_reasons_freq <- function(tbl,
                                     mytitle = "Frequency of most common failure reasons",
                                     mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."
                                     ) {
  
  # test
  # tbl <- dta_reasons_all_yearly_weighted
  # mytitle = "Frequency of failure reasons",
  # mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."
  
  data_for_plot <-  tbl |>
  mutate(n_k = round(n_failures / 1000),
         plot_label = if_else(n_k == 0,
                              glue("{n_failures}"),
                              glue("{n_k}K"))
  )

data_for_label_left <- data_for_plot |>
  filter(vehicle_age == min(vehicle_age))

data_for_label_right <- data_for_plot |>
  filter(vehicle_age == max(vehicle_age))

data_for_plot |>
  ggplot(aes(vehicle_age, rank, color = reason)) +
  geom_bump(show.legend = FALSE) +
  geom_point(show.legend = FALSE) +
  geom_label(aes(label = plot_label, size = n_failures),
             show.legend = FALSE) +
  geom_text(data = data_for_label_left,
            aes(x = vehicle_age, y = rank, color = reason, label = reason),
            hjust = 1, nudge_x = -0.4, size = 7,
            show.legend = FALSE) +
  geom_text(data = data_for_label_right,
            aes(x = vehicle_age, y = rank, color = reason, label = reason),
            hjust = 0, nudge_x = 0.5, size = 7,
            show.legend = FALSE) +
  scale_x_continuous(breaks = 4:15,
                     position = "top") +
  scale_y_reverse(breaks = 1:15) +
  scale_size_continuous(range = c(4, 11)) +
  coord_cartesian(x = c(2.5, 17)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_text(size = 18),
        axis.title.x = element_text(size = 18, hjust = 0),
        axis.title.y = element_text(size = 18, hjust = 1)) +
  labs(
    title = mytitle,
    subtitle = mysubtitle,
    x = "Vehicle age",
    y = "Rank",
    caption = my_caption
  )
}

plot_most_common_reasons_rate <- function(tbl,
                                     mytitle = "Frequency of most common failure reasons",
                                     mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."
                                     ) {
  
  # test
  # tbl <- dta_reasons_all_yearly_weighted
  # mytitle = "Frequency of failure reasons",
  # mysubtitle = "Ranked by number of failures. All years, brands, and models. Rank 1 is most common."
  
  data_for_plot <-  tbl |>
    mutate(pct_reason = n_failures / sum(n_failures),
           .by = c(vehicle_age)) |>
    mutate(pct_reason_all_years = n_failures / sum(n_failures)) |>
    mutate(
      plot_label = percent(pct_reason, accuracy = 1)
    )

data_for_label_left <- data_for_plot |>
  filter(vehicle_age == min(vehicle_age))

data_for_label_right <- data_for_plot |>
  filter(vehicle_age == max(vehicle_age))

data_for_plot |>
  ggplot(aes(vehicle_age, rank, color = reason)) +
  geom_bump(show.legend = FALSE) +
  geom_point(show.legend = FALSE) +
  geom_label(aes(label = plot_label, size = pct_reason_all_years),
             show.legend = FALSE) +
  geom_text(data = data_for_label_left,
            aes(x = vehicle_age, y = rank, color = reason, label = reason),
            hjust = 1, nudge_x = -0.4, size = 7,
            show.legend = FALSE) +
  geom_text(data = data_for_label_right,
            aes(x = vehicle_age, y = rank, color = reason, label = reason),
            hjust = 0, nudge_x = 0.5, size = 7,
            show.legend = FALSE) +
  scale_x_continuous(breaks = 4:15,
                     position = "top") +
  scale_y_reverse(breaks = 1:15) +
  scale_size_continuous(range = c(4, 9)) +
  coord_cartesian(x = c(2.5, 17)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_text(size = 18),
        axis.title.x = element_text(size = 18, hjust = 0),
        axis.title.y = element_text(size = 18, hjust = 1)) +
  labs(
    title = mytitle,
    subtitle = mysubtitle,
    x = "Vehicle age",
    y = "Rank",
    caption = my_caption
  )
}
Show the code
plot_most_common_reasons(
  dta_reason_1_overall_yearly_weighted,
  mytitle = glue('Most common failure reasons'),
  mysubtitle = "Ranked by number of inspection failures. All years, brands, models, and reasons. Rank 1 is most common."
)

Figure 4.2: Most common reasons for inspection failure by vehicle age (all brands)


Figure 4.3 shows the same rankings with the label being the frequency of each ranked item.

Show the code
plot_most_common_reasons_freq(
  dta_reason_1_overall_yearly_weighted,
  mytitle = glue('Frequency of most common failure reasons for all brands'),
  mysubtitle = "Ranked by number of inspection failures. All years, brands, models and reasons. Rank 1 is most common."
)

Figure 4.3: Frequency of most common reasons for inspection failure by vehicle age (all brands)


Figure 4.4 shows the same rankings with the label being the rate of each ranked item in that column (one vehicle age).

Show the code
plot_most_common_reasons_rate(
  dta_reason_1_overall_yearly_weighted,
  mytitle = glue('Rate of failure reasons for all brands'),
  mysubtitle = glue("Ranked by number of inspection failures. All years, brands, models and reasons. Rank 1 is most common.",
                    "\nColumns may not sum to 100% due to rounding. Label size is relative to rates for all reasons all years all ranks.")
)

Figure 4.4: Rate of most common reasons for inspection failure by vehicle age (all brands)