crash_dat_tidy = read_csv("./data/crash_dat.csv")
My goal was to compare the rate of injuries in a crash in 2020 to the same month in 2019 (e.g., rate of injuries in January 2020 v. rate of injuries in January 2019). I examined January through October. I additionally stratified these rate ratio estimates by borough.
To accomplish this, I filtered the tidy data to create one dataset for microvehicles and one for bikes. In each dataset, I then created nested datasets by month, and, in each of these datasets, I mapped Poisson models to extract rate ratio estimates for number of injuries in each borough. Finally, I unnessted these models to extract the desired coefficients: rate ratios and standard errors (used to compute 95% confidence intervals).
Below, I write functions to easily filter for either microvehicles or for bikes.
filter_for = function(dataset, type) {
if (type == "micro") {
dataset %>% filter(str_detect(vehicle_options, "[Bb]ike") |
str_detect(vehicle_options, "REVEL") |
str_detect(vehicle_options, "SCO") |
str_detect(vehicle_options, "MOP") |
str_detect(vehicle_options, "ELEC") |
str_detect(vehicle_options, "^E-")) %>%
filter(vehicle_options != "ESCOVATOR" & vehicle_options != "Bike" &
str_detect(vehicle_options, "Dirt", negate = TRUE),
str_detect(vehicle_options, "[Mm]otorbike", negate = TRUE)
)
}
else if (type == "bike") {
dataset %>% filter(str_starts(vehicle_options, "[Bb]ike"))
}
}
month_df=
tibble(
month = 1:12,
month_name = factor(month.name, ordered = TRUE, levels = month.name)
)
First, I tested to see if there was an overall association of 2020 v. 2019 with number of injuries in microvehicle crashes.
fit_injuries_micro_all = crash_dat_tidy %>%
filter_for("micro") %>%
filter(year %in% c(2019, 2020)) %>%
group_by(year) %>%
mutate(year_2020 = year - 2019) %>%
nest(data = -month) %>%
mutate(models = map(data, ~glm(number_of_persons_injured ~ year_2020,
family = "poisson", data = .x)),
models = map(models, broom::tidy)) %>%
select(-data) %>%
unnest(models) %>%
select(month, term, estimate, std.error, p.value) %>%
mutate(term = str_replace(term, "year_2020", "2020 v. 2019")) %>%
left_join(month_df, by = "month") %>%
select(-month) %>%
rename(month = month_name) %>%
filter(term != "(Intercept)" & month != "November" & month != "December") %>%
select(month, everything())
fit_injuries_micro_all %>% knitr::kable(digits = 2)
month | term | estimate | std.error | p.value |
---|---|---|---|---|
January | 2020 v. 2019 | -0.29 | 0.40 | 0.47 |
February | 2020 v. 2019 | 0.74 | 0.72 | 0.30 |
March | 2020 v. 2019 | -0.43 | 0.32 | 0.18 |
April | 2020 v. 2019 | 0.14 | 0.42 | 0.74 |
May | 2020 v. 2019 | -0.23 | 0.26 | 0.39 |
June | 2020 v. 2019 | -0.12 | 0.23 | 0.61 |
July | 2020 v. 2019 | 0.13 | 0.16 | 0.42 |
August | 2020 v. 2019 | 0.07 | 0.11 | 0.51 |
September | 2020 v. 2019 | 0.04 | 0.11 | 0.69 |
October | 2020 v. 2019 | -0.06 | 0.12 | 0.60 |
Although there was no statistically significant evidence for a different rate of injury in 2020 v. 21 in any month, I proceeded to examine this stratified by borough.
I also wanted to get descriptives for number of microvehicle crashes and injuries in each borough in each month.
micro_num = crash_dat_tidy %>%
filter_for("micro") %>%
filter(year %in% c(2019, 2020)) %>%
group_by(year, month, borough) %>%
summarise(number_of_crashes = n(),
mean_number_of_injuries = mean(number_of_persons_injured)) %>%
drop_na(borough) %>%
left_join(month_df, by = "month") %>%
as.tibble() %>%
mutate(borough = str_to_title(borough))
micro_num %>%
select(year, month_name, borough, number_of_crashes,
mean_number_of_injuries) %>%
head() %>% knitr::kable(digits = 2)
year | month_name | borough | number_of_crashes | mean_number_of_injuries |
---|---|---|---|---|
2019 | January | Brooklyn | 2 | 1.00 |
2019 | January | Manhattan | 1 | 1.00 |
2019 | January | Queens | 1 | 1.00 |
2019 | February | Bronx | 2 | 0.50 |
2019 | February | Brooklyn | 2 | 0.00 |
2019 | March | Bronx | 3 | 1.33 |
micro_num %>%
mutate(date = as.yearmon(paste(year, month, sep = "-"))) %>%
ggplot(aes(x = date, y = mean_number_of_injuries, fill = borough)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Mean Number of Injuries per Microvehicle Crash by Borough",
x = "Month",
y = "Mean Number of Injuries"
) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ borough)
micro_num %>%
mutate(date = as.yearmon(paste(year, month, sep = "-"))) %>%
ggplot(aes(x = date, y = number_of_crashes, fill = borough)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Number of Crashes by Borough",
x = "Month",
y = "Number of Crashes"
) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ borough)
It is apparent that there are very low numbers of microvehicle crashes when stratifying by both month and borough - sometimes as low as only 1 crash, or even 0 crashes (particularly in Staten Island).
Below, I extract the rate ratio estimates for the microvehicle data by borough.
fit_injuries_micro = crash_dat_tidy %>%
filter_for("micro") %>%
filter(year %in% c(2019, 2020)) %>%
mutate(borough = str_to_title(borough)) %>%
group_by(year) %>%
mutate(year_2020 = year - 2019) %>%
nest(data = -month) %>%
mutate(models = map(data, ~glm(number_of_persons_injured ~ year_2020:borough,
family = "poisson", data = .x)),
models = map(models, broom::tidy)) %>%
select(-data) %>%
unnest(models) %>%
select(month, term, estimate, std.error, p.value) %>%
mutate(term = str_replace(term, "year_2020:borough", "2020 v. 2019, Borough: ")) %>%
left_join(month_df, by = "month") %>%
select(-month) %>%
rename(month = month_name) %>%
select(month, everything())
fit_injuries_micro %>%
filter(term != "(Intercept)" & month != "November" & month != "December" & p.value < .05) %>%
knitr::kable(digits = 2)
month | term | estimate | std.error | p.value |
---|
Here, I extract my coefficient of interest, which is the difference in injury rate in microvehicle crashes occurring in 2020 v. occurring in 2019. I plot estimates for each borough within each month, making sure to exponentiate these estimates. I include a horizontal line at Y = 1 to indicate the null value.
fit_injuries_micro %>%
filter(term != "(Intercept)" & month != "November" & month != "December") %>%
mutate(term = str_replace(term, "2020 v. 2019, ", "")) %>%
ggplot(aes(x = month, y = exp(estimate), color = term)) +
geom_point(show.legend = FALSE, aes(size = estimate, alpha = .7)) +
geom_errorbar(aes(ymin = exp(estimate - (1.96*std.error)),
ymax = exp(estimate + (1.96*std.error)))) +
geom_hline(yintercept = 1, linetype="dashed",
color = "darkred", size = 1, alpha = .7) +
labs(
title = "Difference in Rate of Injuries Per Crash Involving a Microvehicle in 2020 v. 2019",
x = "Month",
y = "2020 v. 2019 Difference"
) +
ylim(0, 5) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ term)
There is no discernable pattern in the figure, suggesting that rates of injury per microvehicle crash did not differ between 2020 and 2019. In part, this could be due to very wide confidence intervals.
Again, I first tested to see if there was an overall association of 2020 v. 2019 with number of injuries in bike crashes (as in the microvehicle data).
fit_injuries_bike_all = crash_dat_tidy %>%
filter_for("bike") %>%
filter(year %in% c(2019, 2020)) %>%
group_by(year) %>%
mutate(year_2020 = year - 2019) %>%
nest(data = -month) %>%
mutate(models = map(data, ~glm(number_of_persons_injured ~ year_2020,
family = "poisson", data = .x)),
models = map(models, broom::tidy)) %>%
select(-data) %>%
unnest(models) %>%
select(month, term, estimate, std.error, p.value) %>%
mutate(term = str_replace(term, "year_2020", "2020 v. 2019")) %>%
left_join(month_df, by = "month") %>%
select(-month) %>%
rename(month = month_name) %>%
filter(term != "(Intercept)" & month != "November" & month != "December") %>%
select(month, everything())
fit_injuries_bike_all %>% knitr::kable(digits = 2)
month | term | estimate | std.error | p.value |
---|---|---|---|---|
January | 2020 v. 2019 | 0.01 | 0.09 | 0.89 |
February | 2020 v. 2019 | 0.03 | 0.10 | 0.78 |
March | 2020 v. 2019 | 0.13 | 0.09 | 0.13 |
April | 2020 v. 2019 | 0.01 | 0.10 | 0.96 |
May | 2020 v. 2019 | 0.01 | 0.07 | 0.90 |
June | 2020 v. 2019 | 0.02 | 0.06 | 0.69 |
July | 2020 v. 2019 | 0.05 | 0.06 | 0.35 |
August | 2020 v. 2019 | 0.04 | 0.06 | 0.51 |
September | 2020 v. 2019 | 0.04 | 0.06 | 0.47 |
October | 2020 v. 2019 | 0.08 | 0.07 | 0.24 |
Although there was no statistically significant evidence for a different rate of injury in 2020 v. 21 in any month, I proceeded to examine this stratified by borough.
I also wanted to get descriptives for number of bike crashes and injuries in each borough in each month.
bike_num = crash_dat_tidy %>%
filter_for("bike") %>%
filter(year %in% c(2019, 2020)) %>%
group_by(year, month, borough) %>%
summarise(number_of_crashes = n(),
mean_number_of_injuries = mean(number_of_persons_injured)) %>%
drop_na(borough) %>%
left_join(month_df, by = "month") %>%
as.tibble() %>%
mutate(borough = str_to_title(borough))
bike_num %>%
select(year, month_name, borough, number_of_crashes,
mean_number_of_injuries) %>%
head() %>% knitr::kable(digits = 2)
year | month_name | borough | number_of_crashes | mean_number_of_injuries |
---|---|---|---|---|
2019 | January | Bronx | 17 | 0.76 |
2019 | January | Brooklyn | 74 | 0.85 |
2019 | January | Manhattan | 67 | 0.76 |
2019 | January | Queens | 41 | 0.88 |
2019 | January | Staten Island | 1 | 0.00 |
2019 | February | Bronx | 18 | 0.83 |
bike_num %>%
mutate(date = as.yearmon(paste(year, month, sep = "-"))) %>%
ggplot(aes(x = date, y = mean_number_of_injuries, fill = borough)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Mean Number of Injuries per Bike Crash by Borough",
x = "Month",
y = "Mean Number of Injuries"
) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ borough)
bike_num %>%
mutate(date = as.yearmon(paste(year, month, sep = "-"))) %>%
ggplot(aes(x = date, y = number_of_crashes, fill = borough)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Number of Bike Crashes by Borough",
x = "Month",
y = "Number of Crashes"
) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ borough)
It is again apparent that there are very low numbers of crashes when stratifying by both month and borough.
Below, I extract the rate ratio estimates for the bike data by borough (as in the microvehicle data).
fit_injuries_bike = crash_dat_tidy %>%
filter_for("bike") %>%
mutate(borough = str_to_title(borough)) %>%
filter(year %in% c(2019, 2020)) %>%
group_by(year) %>%
mutate(year_2020 = year - 2019) %>%
nest(data = -month) %>%
mutate(models = map(data, ~glm(number_of_persons_injured ~ year_2020:borough,
family = "poisson", data = .x)),
models = map(models, broom::tidy)) %>%
select(-data) %>%
unnest(models) %>%
select(month, term, estimate, std.error, p.value) %>%
mutate(term = str_replace(term, "year_2020:borough", "2020 v. 2019, Borough: ")) %>%
left_join(month_df, by = "month") %>%
select(-month) %>%
rename(month = month_name) %>%
select(month, everything())
fit_injuries_bike %>%
filter(term != "(Intercept)" & month != "November" & month != "December" & p.value < .05) %>%
knitr::kable(digits = 2)
month | term | estimate | std.error | p.value |
---|---|---|---|---|
March | 2020 v. 2019, Borough: Brooklyn | 0.27 | 0.13 | 0.03 |
Below, I plot the estimates for the bike data (as in the microvehicle data).
fit_injuries_bike %>%
filter(term != "(Intercept)" & month != "November" & month != "December") %>%
mutate(term = str_replace(term, "2020 v. 2019, ", "")) %>%
ggplot(aes(x = month, y = exp(estimate), color = term)) +
geom_point(show.legend = FALSE, aes(size = estimate, alpha = .7)) +
geom_errorbar(aes(ymin = exp(estimate - (1.96*std.error)),
ymax = exp(estimate + (1.96*std.error)))) +
geom_hline(yintercept = 1, linetype="dashed",
color = "darkred", size = 1, alpha = .7) +
labs(
title = "Difference in Rate of Injuries Per Crash Involving a Bike in 2020 v. 2019",
x = "Month",
y = "2020 v. 2019 Difference"
) +
ylim(0, 5) +
theme(legend.position="right", legend.title = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 60, hjust = 1, size = 8)) +
facet_grid(. ~ term)
There is no discernable pattern in the figure, suggesting that rates of injury per bike crash did not differ between 2020 and 2019.
In sum, there was no evidence that rates of injuries per crash differed between 2020 and 2019 for crashes that involved either microvehicles or bikes.