if(!require("tidyverse")) install.packages("tidyverse")
# Let's start with Fare Revenue
library(tidyverse)
if(!file.exists("2022_fare_revenue.xlsx")){
# This should work _in theory_ but in practice it's still a bit finicky
# If it doesn't work for you, download this file 'by hand' in your
# browser and save it as "2022_fare_revenue.xlsx" in your project
# directory.
download.file("http://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Fare%20Revenue.xlsx",
destfile="2022_fare_revenue.xlsx",
quiet=FALSE,
method="wget")
}<- readxl::read_xlsx("2022_fare_revenue.xlsx") |>
FARES select(-`State/Parent NTD ID`,
-`Reporter Type`,
-`Reporting Module`,
-`TOS`,
-`Passenger Paid Fares`,
-`Organization Paid Fares`) |>
filter(`Expense Type` == "Funds Earned During Period") |>
select(-`Expense Type`)
# Next, expenses
if(!file.exists("2022_expenses.csv")){
# This should work _in theory_ but in practice it's still a bit finicky
# If it doesn't work for you, download this file 'by hand' in your
# browser and save it as "2022_expenses.csv" in your project
# directory.
download.file("https://data.transportation.gov/api/views/dkxx-zjd6/rows.csv?date=20231102&accessType=DOWNLOAD&bom=true&format=true",
destfile="2022_expenses.csv",
quiet=FALSE,
method="wget")
}<- readr::read_csv("2022_expenses.csv") |>
EXPENSES select(`NTD ID`,
`Agency`,
`Total`,
`Mode`) |>
mutate(`NTD ID` = as.integer(`NTD ID`)) |>
rename(Expenses = Total) |>
group_by(`NTD ID`, `Mode`) |>
summarize(Expenses = sum(Expenses)) |>
ungroup()
Mini Project#1
Introduction
This report presents an in-depth analysis of 2022 transit data, highlighting key aspects such as fare revenues, expenses, transit usage, and performance across different agencies and transportation modes in the United States. The analysis covers trends in unlinked passenger trips (UPT), vehicle revenue miles (VRM), farebox recovery ratios, and cost efficiency measured by expenses per UPT. Publicly available data, including sources from the U.S. Department of Transportation, was processed using R to extract valuable insights.
Data Overview
The data used in this analysis includes the following:
Fare Revenue Data: Data on fare revenue earned by various transit systems. Expense Data: Data related to transit expenses. Ridership Data: Data on monthly figures on unlinked passenger trips (UPT) and vehicle revenue miles (VRM) for different transit agencies and modes.
The previous code will download, clean, and join the tables. The 2022 Fare Revenue file contains all the data of transit Agency, Urbanized Area Name, Mode of transportation, date, Number of Passengers, metro area, and the miles that vehicles travel while in revenue service. The file will later be joined to the expense file that documents the expenses in 2022.
Using the above data we can create a comprehensive analysis of how much each transit agency earns in fares relative to their expenses. Later into the project, I will be joining the fare revenue data with the expense file, we can directly compare the income generated by different transit modes against the expenses incurred in running these services. This comparison will provide insights into which transit system in the country are more effective to another.
# Monthly Transit Numbers
library(tidyverse)
if(!file.exists("ridership.xlsx")){
# This should work _in theory_ but in practice it's still a bit finicky
# If it doesn't work for you, download this file 'by hand' in your
# browser and save it as "ridership.xlsx" in your project
# directory.
download.file("https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-09/July%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_240903.xlsx",
destfile="ridership.xlsx",
quiet=FALSE,
method="wget")
}<- readxl::read_xlsx("ridership.xlsx", sheet="UPT") |>
TRIPS filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="UPT") |>
drop_na() |>
mutate(month=my(month)) # Parse _m_onth _y_ear date specs
<- readxl::read_xlsx("ridership.xlsx", sheet="VRM") |>
MILES filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="VRM") |>
drop_na() |>
group_by(`NTD ID`, `Agency`, `UZA Name`,
`Mode`, `3 Mode`, month) |>
summarize(VRM = sum(VRM)) |>
ungroup() |>
mutate(month=my(month)) # Parse _m_onth _y_ear date specs
<- inner_join(TRIPS, MILES) |>
USAGE mutate(`NTD ID` = as.integer(`NTD ID`))
if(!require("DT")) install.packages("DT")
library(DT)
sample_n(USAGE, 1000) |>
mutate(month=as.character(month)) |>
::datatable() DT
<- USAGE |>
USAGE rename(`metro_area` = `UZA Name`)
<- inner_join(FARES, EXPENSES, join_by(`NTD ID`, `Mode`)) FINANCIALS
Tidyverse will help the clarity of the data by manipulating and visualizing the data. Using the select() and filter() functions from dplyr package, I am able to make the data set more manageable and focused on relevant data for specific questions to answer. Columns such as State/Parent NYT ID would not be needed and thus, excluded out. Similar to the Fares and Expenses files, a csv.reader is used to scan large files to output. The rename function would change UZA Name to metro_area, thus becoming more coherent. The USAGE table was created By joining the TRIPS table and MILES table and the FINANCIALS table was created by joining the FARES and EXPENSES tables.
<- USAGE |>
USAGE mutate(Mode=case_when(
== "HR" ~ "Heavy Rail",
Mode == "LR" ~ "Light Rail",
Mode == "CR" ~ "Commuter Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "CC" ~ "Cable Car",
Mode == "MG" ~ "Monorail/Automated Guideway",
Mode == "FB" ~ "Ferryboats",
Mode == "TR" ~ "Aerial Tramways",
Mode == "MB" ~ "Bus",
Mode == "TB" ~ "Trolleybus",
Mode == "CB" ~ "Commuter Bus",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "PB" ~ "Publico",
Mode == "DR" ~ "Demand Reponse",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode == "SR" ~ "Streetcar Rail",
Mode == "AR" ~ "Alaska Railroad",
Mode TRUE ~ "Unknown"))
if(!require("DT")) install.packages("DT")
library(DT)
sample_n(USAGE, 1000) |>
mutate(month=as.character(month)) |>
::datatable() DT
The Mode column uses a set of codes that aren’t interpretative. To make the code easy to read, a case_when statement is to transform into something we can make sense of. From the NTD website, I was able to find the abbreviations and reverse it.
## 3A.What transit agency had the most total VRM in our data set?
<- USAGE |>
Agency_VRM group_by(`Agency`)|>
summarize(total_VRM = sum(VRM, na.rm = TRUE))
<- max(Agency_VRM$total_VRM, na.rm = TRUE)
max_VRM
<- Agency_VRM |>
top_Agency filter(total_VRM == max_VRM)
print(top_Agency)
# A tibble: 1 × 2
Agency total_VRM
<chr> <dbl>
1 MTA New York City Transit 10832855350
## MTA New York City Transit had the most total VRM in this sample
## 3B.What transit mode had the most total VRM in our data set?
<- USAGE |>
Mode_VRM group_by(`Mode`)|>
summarize(total_VRM = sum(VRM, na.rm = TRUE))
<- max(Mode_VRM$total_VRM, na.rm = TRUE)
max_VRM
<- Mode_VRM |>
top_mode filter(total_VRM == max_VRM)
print(top_mode)
# A tibble: 1 × 2
Mode total_VRM
<chr> <dbl>
1 Bus 49444494088
## BUS had the most total VRM in this sample
## 3C.How many trips were taken on the NYC Subway (Heavy Rail) in May 2024?
<- TRIPS |>
nyc_subway_may_trips filter(Mode == "HR")|>
filter(Agency == 'MTA New York City Transit')|>
filter(month == "2024-05-01")|>
select(UPT)
print(nyc_subway_may_trips)
# A tibble: 1 × 1
UPT
<dbl>
1 180458819
## 180,458,819 trips were taken on the NYC Subway (Heavy Rail) in May 2024
## 3E.How much did NYC subway ridership fall between April 2019 and April 2020?
<- TRIPS |>
nyc_subway_april_trips filter(Mode == 'HR')|>
filter(Agency == 'MTA New York City Transit')|>
filter(between(month, as.Date("2019-04-01"), as.Date("2020-04-01")))|>
summarize(UPT = sum(UPT, na.rm = TRUE))
print(nyc_subway_april_trips)
# A tibble: 1 × 1
UPT
<dbl>
1 2706415856
## 2,706,415,856 subway ridership fall between April 2019 and April 2020
## 4A.What metro area had the most total VRM in our data set?
<- USAGE |>
metro_area_VRM group_by(`metro_area`)|>
summarize(total_VRM = sum(VRM, na.rm = TRUE))
<- max(metro_area_VRM$total_VRM, na.rm = TRUE)
max_VRM
<- metro_area_VRM |>
top_metro_area filter(total_VRM == max_VRM)
print(top_metro_area)
# A tibble: 1 × 2
metro_area total_VRM
<chr> <dbl>
1 New York--Jersey City--Newark, NY--NJ 21190345637
## New York--Jersey City--Newark, NY--NJ had the most total VRM in this sample
## 4B.How much UPT were taken on the Light Rail in November 2023?
<- TRIPS |>
nyc_subway_may_trips filter(Mode == "LR")|>
filter(month == "2023-11-01")|>
summarize(UPT = sum(UPT, na.rm = TRUE))
print(nyc_subway_may_trips)
# A tibble: 1 × 1
UPT
<dbl>
1 27166436
## 27,166,436 UPT were taken on the Light Rail in November 2023
## 4C.How much did NYC subway ridership fall between March 2020 and March 2022?
<- TRIPS |>
nyc_subway_march_trips filter(Mode == 'HR')|>
filter(Agency == 'MTA New York City Transit')|>
filter(between(month, as.Date("2020-03-01"), as.Date("2022-03-01")))|>
summarize(UPT = sum(UPT, na.rm = TRUE))
print(nyc_subway_march_trips)
# A tibble: 1 × 1
UPT
<dbl>
1 2341451028
## 2,341,451,028 subway ridership fall between March 2020 and March 2022
The following questions are some examples that the data set can answer:
1.What transit agency had the most total VRM in our data set? MTA New York City Transit had the most total VRM in this sample
2.What transit mode had the most total VRM in our data set? BUS had the most total VRM in this sample
3.How many trips were taken on the NYC Subway (Heavy Rail) in May 2024? 180,458,819 trips were taken on the NYC Subway (Heavy Rail) in May 2024
4.How much did NYC subway ridership fall between April 2019 and April 2020? 2,706,415,856 subway ridership fall between April 2019 and April 2020
5.What metro area had the most total VRM in our data set? New York–Jersey City–Newark, NY–NJ had the most total VRM in this sample
6.How much UPT were taken on the Light Rail in November 2023? 27,166,436 UPT were taken on the Light Rail in November 2023
7.How much did NYC subway ridership fall between March 2020 and March 2022? 2,341,451,028 subway ridership fall between March 2020 and March 2022
<- FINANCIALS |>
FINANCIALS mutate(Mode=case_when(
== "HR" ~ "Heavy Rail",
Mode == "LR" ~ "Light Rail",
Mode == "CR" ~ "Commuter Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "CC" ~ "Cable Car",
Mode == "MG" ~ "Monorail/Automated Guideway",
Mode == "FB" ~ "Ferryboats",
Mode == "TR" ~ "Aerial Tramways",
Mode == "MB" ~ "Bus",
Mode == "TB" ~ "Trolleybus",
Mode == "CB" ~ "Commuter Bus",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "PB" ~ "Publico",
Mode == "DR" ~ "Demand Reponse",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode == "SR" ~ "Streetcar Rail",
Mode == "AR" ~ "Alaska Railroad",
Mode TRUE ~ "Unknown"))
#5: Table Summarization
library(dplyr)
library(lubridate)
<- USAGE |>
USAGE_2022_ANNUAL filter(year(month) == 2022) |>
group_by(`NTD ID`, `Agency`, `metro_area`, `Mode`) |>
summarize(
UPT = sum(UPT, na.rm = TRUE),
VRM = sum(VRM, na.rm = TRUE)
|>
) ungroup()
print(USAGE_2022_ANNUAL)
# A tibble: 1,141 × 6
`NTD ID` Agency metro_area Mode UPT VRM
<int> <chr> <chr> <chr> <dbl> <dbl>
1 1 King County Seattle--… Bus 5.40e7 6.16e7
2 1 King County Seattle--… Dema… 6.63e5 1.29e7
3 1 King County Seattle--… Ferr… 4.00e5 5.12e4
4 1 King County Seattle--… Stre… 1.12e6 1.80e5
5 1 King County Seattle--… Trol… 9.58e6 2.64e6
6 1 King County Seattle--… Vanp… 7.03e5 4.41e6
7 2 Spokane Transit Authority Spokane, … Bus 6.60e6 6.49e6
8 2 Spokane Transit Authority Spokane, … Dema… 3.10e5 4.04e6
9 2 Spokane Transit Authority Spokane, … Vanp… 9.06e4 9.06e5
10 3 Pierce County Transportation Benefit… Seattle--… Bus 4.95e6 4.23e6
# ℹ 1,131 more rows
<- left_join(USAGE_2022_ANNUAL,
USAGE_AND_FINANCIALS
FINANCIALS,join_by(`NTD ID`, Mode)) |>
drop_na()
Similarly to the USAGE MODE Column, to ensure all the data is consistent, it is crucial to do the same in the FINANCALS table.The code filter uses the year() function from lubridate package It then groups the data by NTD ID
, Agency, metro_area, Mode.The summarize function calculates the total Unlinked Passenger Trips (UPT) and Vehicle Revenue Miles and sums up the total. After summarizing, ungroup() is called to remove the grouping,making the dataset easier to work with.
## 6A.Which transit system (agency and mode) had the most UPT in 2022?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(`Agency`, `Mode`) |>
summarize(total_UPT = sum(UPT, na.rm = TRUE))
<- max(financials_summary$total_UPT, na.rm = TRUE)
max_UPT
<- financials_summary |>
top_transit_system filter(total_UPT == max_UPT)
print(top_transit_system)
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode total_UPT
<chr> <chr> <dbl>
1 MTA New York City Transit Heavy Rail 1793073801
## MTA New York City Transit Heavy Rail (Heavy Rail) has the most UPT in 2022
## 6B.Which transit system (agency and mode) had the highest farebox recovery, defined as the highest ratio of Total Fares to Expenses?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(Agency, Mode) |>
summarize(total_farebox_recovery = sum(`Total Fares`, na.rm = TRUE) / sum(Expenses, na.rm = TRUE), .groups= "drop")
<- max(financials_summary$total_farebox_recovery, na.rm = TRUE)
max_farebox_recovery
<- financials_summary |>
highest_farebox_recovery filter(total_farebox_recovery == max_farebox_recovery)
print(highest_farebox_recovery)
# A tibble: 1 × 3
Agency Mode total_farebox_recovery
<chr> <chr> <dbl>
1 Transit Authority of Central Kentucky Vanpool 2.38
## Transit Authority of Central Kentucky (Vanpool)
##6C. Which transit system (agency and mode) has the lowest expenses per UPT?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(Agency, Mode) |>
summarize(expenses_per_UPT = sum(Expenses, na.rm = TRUE) / sum(UPT, na.rm = TRUE), .groups = "drop")
<- min(financials_summary$expenses_per_UPT, na.rm = TRUE)
min_expenses_per_UPT
<- financials_summary |>
lowest_expense_transit_system filter(expenses_per_UPT == min_expenses_per_UPT)
print(lowest_expense_transit_system)
# A tibble: 1 × 3
Agency Mode expenses_per_UPT
<chr> <chr> <dbl>
1 North Carolina State University Bus 1.18
## North Carolina State University (Bus)
##6D.Which transit system (agency and mode) has the highest total fares per UPT?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(Agency, Mode) |>
summarize(total_fares_per_UPT = sum(`Total Fares`, na.rm = TRUE) / sum(UPT, na.rm = TRUE), .groups = "drop")
<- max(financials_summary$total_fares_per_UPT, na.rm = TRUE)
max_total_fares_per_UPT
<- financials_summary |>
highest_total_fares_transit_system filter(total_fares_per_UPT == max_total_fares_per_UPT)
print(highest_total_fares_transit_system)
# A tibble: 1 × 3
Agency Mode total_fares_per_UPT
<chr> <chr> <dbl>
1 Altoona Metro Transit Demand Reponse 330.
## Altoona Metro Transit (Demand Reponse)
## 6E. Which transit system (agency and mode) has the lowest expenses per VRM?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(Agency, Mode) |>
summarize(expenses_per_VRM = sum(Expenses, na.rm = TRUE) / sum(VRM, na.rm = TRUE), .groups = "drop")
<- min(financials_summary$expenses_per_VRM, na.rm = TRUE)
min_expenses_per_VRM
<- financials_summary |>
lowest_expense_transit_system_VRM filter(expenses_per_VRM == min_expenses_per_VRM)
print(lowest_expense_transit_system_VRM)
# A tibble: 1 × 3
Agency Mode expenses_per_VRM
<chr> <chr> <dbl>
1 New Mexico Department of Transportation Vanpool 0.337
## New Mexico Department of Transportation (Vanpool)
### 6F.Which transit system (agency and mode) has the highest total fares per VRM?
<- USAGE_AND_FINANCIALS |>
financials_summary group_by(Agency, Mode) |>
summarize(total_fares_per_VRM = sum(`Total Fares`, na.rm = TRUE) / sum(VRM, na.rm = TRUE), .groups = "drop")
<- max(financials_summary$total_fares_per_VRM, na.rm = TRUE)
max_total_fares_per_VRM
<- financials_summary |>
highest_total_fares_transit_system_VRM filter(total_fares_per_VRM == max_total_fares_per_VRM)
print(highest_total_fares_transit_system_VRM)
# A tibble: 1 × 3
Agency Mode total_fares_per_VRM
<chr> <chr> <dbl>
1 Chicago Water Taxi (Wendella) Ferryboats 237.
### Chicago Water Taxi (Wendella) (Ferryboats)
The following questions are some examples that the data set can answer:
1.Which transit system (agency and mode) had the most UPT in 2022? MTA New York City Transit Heavy Rail (Heavy Rail) has the most UPT in 2022.
2.Which transit system (agency and mode) had the highest farebox recovery, defined as the highest ratio of Total Fares to Expenses? Transit Authority of Central Kentucky (Vanpool) had the highest farebox recovery.
3.Which transit system (agency and mode) has the lowest expenses per UPT? North Carolina State University (Bus) has the lowest expenses per UPT.
4.Which transit system (agency and mode) has the highest total fares per UPT? Altoona Metro Transit (Demand Reponse) has the highest total fares per UPT.
5.Which transit system (agency and mode) has the lowest expenses per VRM? New Mexico Department of Transportation (Vanpool) has the lowest expenses per VRM.
6.Which transit system (agency and mode) has the highest total fares per VRM? Chicago Water Taxi (Wendella) (Ferryboats) has the highest total fares per VRM.
Conclusion
This analysis emphasizes the performance of transit systems across the U.S, with New York City’s MTA consistently ranking highest in both VRM and UPT. These results offer valuable insights for transit agencies and serves a purpose as a benchmark for future performance.