Data Quality QA: Pennsylvania Enrollment Data
Source:vignettes/data-quality-qa.Rmd
data-quality-qa.RmdOverview
This vignette performs data quality analysis on Pennsylvania enrollment data from the Pennsylvania Department of Education (PDE). We examine:
- Statewide enrollment time series (looking for anomalies)
- Year-over-year changes (flagging jumps > 5%)
- Major district enrollment trends
- Data quality issues discovered
Note: This vignette requires network access to fetch data from PDE. Run the code interactively to execute the analysis.
Data Collection
Fetch enrollment data for available years. PDE provides data from 2005-present, though format consistency varies.
# Attempt to fetch all available years
# Start with more recent years that have more consistent formatting
years_to_fetch <- 2015:2024
# Fetch data with error handling
all_enr <- tryCatch({
fetch_enr_years(years_to_fetch)
}, error = function(e) {
message("Error fetching some years: ", e$message)
NULL
})
if (is.null(all_enr) || nrow(all_enr) == 0) {
stop("Could not fetch any enrollment data. Check network connection and PDE website availability.")
}
# Report what years we got
years_fetched <- unique(all_enr$end_year)
message("Successfully fetched data for years: ", paste(years_fetched, collapse = ", "))Statewide Time Series Analysis
Total Enrollment Trend
state_totals <- all_enr %>%
filter(
is_state | type == "Statewide",
subgroup == "total_enrollment",
grade_level == "TOTAL"
) %>%
select(end_year, n_students) %>%
arrange(end_year) %>%
distinct()
# If no statewide rows exist, create from district sums
if (nrow(state_totals) == 0) {
state_totals <- all_enr %>%
filter(
is_district,
subgroup == "total_enrollment",
grade_level == "TOTAL"
) %>%
group_by(end_year) %>%
summarize(n_students = sum(n_students, na.rm = TRUE), .groups = "drop") %>%
arrange(end_year)
}
state_totalsYear-over-Year Changes
Flag any year with enrollment change > 5% as potentially anomalous.
state_yoy <- state_totals %>%
arrange(end_year) %>%
mutate(
prev_year_enr = lag(n_students),
change = n_students - prev_year_enr,
pct_change = (n_students - prev_year_enr) / prev_year_enr * 100,
flag_large_change = abs(pct_change) > 5
)
# Display changes
state_yoy %>%
select(end_year, n_students, change, pct_change, flag_large_change) %>%
mutate(
pct_change = round(pct_change, 2),
change = format(change, big.mark = ",")
)Visualization
ggplot(state_totals, aes(x = end_year, y = n_students)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue", size = 2) +
scale_y_continuous(labels = comma, limits = c(0, NA)) +
scale_x_continuous(breaks = seq(min(state_totals$end_year), max(state_totals$end_year), by = 1)) +
labs(
title = "Pennsylvania Public School Enrollment",
subtitle = paste("School Years", min(state_totals$end_year) - 1, "-", min(state_totals$end_year),
"through", max(state_totals$end_year) - 1, "-", max(state_totals$end_year)),
x = "End Year (e.g., 2024 = 2023-24 school year)",
y = "Total Enrollment"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Major District Analysis
Analyze enrollment trends for Pennsylvania’s largest urban districts.
Major District AUNs
Pennsylvania’s largest districts by enrollment:
| District | AUN |
|---|---|
| Philadelphia City SD | 126515001 |
| Pittsburgh SD | 102027451 |
| Allentown City SD | 121390302 |
| Reading SD | 114067503 |
| Erie City SD | 105252602 |
# Define major district AUNs
major_districts <- tibble::tibble(
aun = c("126515001", "102027451", "121390302", "114067503", "105252602"),
district_name = c("Philadelphia", "Pittsburgh", "Allentown", "Reading", "Erie")
)
# Get district-level enrollment
district_enr <- all_enr %>%
filter(
is_district,
subgroup == "total_enrollment",
grade_level == "TOTAL"
) %>%
select(end_year, aun, lea_name, n_students)
# Filter to major districts
major_district_enr <- district_enr %>%
inner_join(major_districts, by = "aun") %>%
select(end_year, district_name, lea_name, n_students) %>%
arrange(district_name, end_year)
major_district_enrMajor District Year-over-Year Changes
major_yoy <- major_district_enr %>%
group_by(district_name) %>%
arrange(end_year) %>%
mutate(
prev_year_enr = lag(n_students),
pct_change = (n_students - prev_year_enr) / prev_year_enr * 100,
flag = abs(pct_change) > 5
) %>%
ungroup() %>%
filter(!is.na(pct_change))
# Show any large changes
major_flags <- major_yoy %>%
filter(flag) %>%
select(end_year, district_name, n_students, pct_change) %>%
mutate(pct_change = round(pct_change, 2))
if (nrow(major_flags) > 0) {
cat("Major districts with >5% year-over-year change:\n")
print(major_flags)
} else {
cat("No major districts had enrollment changes exceeding 5%.\n")
}Major District Trend Visualization
if (nrow(major_district_enr) > 0) {
ggplot(major_district_enr, aes(x = end_year, y = n_students, color = district_name)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_y_continuous(labels = comma) +
scale_x_continuous(breaks = seq(min(major_district_enr$end_year),
max(major_district_enr$end_year), by = 1)) +
labs(
title = "Enrollment Trends: Major Pennsylvania Districts",
x = "End Year",
y = "Total Enrollment",
color = "District"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}Demographic Distribution Check
Verify that demographic breakdowns sum to approximately the total.
demo_check <- all_enr %>%
filter(
is_state | type == "Statewide",
grade_level == "TOTAL",
subgroup %in% c("total_enrollment", "white", "black", "hispanic",
"asian", "native_american", "pacific_islander", "multiracial")
) %>%
select(end_year, subgroup, n_students) %>%
tidyr::pivot_wider(names_from = subgroup, values_from = n_students) %>%
mutate(
demo_sum = rowSums(select(., -end_year, -total_enrollment), na.rm = TRUE),
diff = total_enrollment - demo_sum,
pct_diff = diff / total_enrollment * 100
)
demo_check %>%
select(end_year, total_enrollment, demo_sum, diff, pct_diff) %>%
mutate(pct_diff = round(pct_diff, 2))Grade Distribution Check
Verify grade-level enrollments sum to total.
grade_check <- all_enr %>%
filter(
is_state | type == "Statewide",
subgroup == "total_enrollment"
) %>%
select(end_year, grade_level, n_students) %>%
group_by(end_year) %>%
summarize(
total = sum(n_students[grade_level == "TOTAL"], na.rm = TRUE),
grade_sum = sum(n_students[grade_level != "TOTAL"], na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
diff = total - grade_sum,
pct_diff = diff / total * 100
)
grade_check %>%
mutate(pct_diff = round(pct_diff, 2))Missing Data Analysis
# Check for missing values in key columns
missing_summary <- all_enr %>%
filter(grade_level == "TOTAL", subgroup == "total_enrollment") %>%
summarize(
total_rows = n(),
missing_aun = sum(is.na(aun) | aun == ""),
missing_n_students = sum(is.na(n_students)),
missing_lea_name = sum(is.na(lea_name) | lea_name == ""),
missing_county = sum(is.na(county) | county == "")
)
missing_summaryDistrict Count by Year
Check that the number of districts remains relatively stable.
district_counts <- all_enr %>%
filter(
is_district,
subgroup == "total_enrollment",
grade_level == "TOTAL"
) %>%
group_by(end_year) %>%
summarize(
n_districts = n_distinct(aun),
.groups = "drop"
) %>%
arrange(end_year)
district_counts
if (nrow(district_counts) > 1) {
ggplot(district_counts, aes(x = end_year, y = n_districts)) +
geom_col(fill = "steelblue") +
scale_y_continuous(limits = c(0, NA)) +
labs(
title = "Number of Pennsylvania School Districts by Year",
x = "End Year",
y = "Number of Districts"
) +
theme_minimal()
}Data Quality Issues Summary
issues <- list()
# Check for large year-over-year changes
if (exists("state_yoy") && any(state_yoy$flag_large_change, na.rm = TRUE)) {
flagged_years <- state_yoy$end_year[state_yoy$flag_large_change & !is.na(state_yoy$flag_large_change)]
issues <- c(issues, paste("Large statewide enrollment changes in:", paste(flagged_years, collapse = ", ")))
}
# Check for demographic sum mismatches
if (exists("demo_check") && any(abs(demo_check$pct_diff) > 1, na.rm = TRUE)) {
bad_years <- demo_check$end_year[abs(demo_check$pct_diff) > 1 & !is.na(demo_check$pct_diff)]
issues <- c(issues, paste("Demographic sums don't match total in:", paste(bad_years, collapse = ", ")))
}
# Check for missing data
if (exists("missing_summary") && missing_summary$missing_n_students > 0) {
issues <- c(issues, paste("Missing enrollment counts:", missing_summary$missing_n_students, "records"))
}
# Check for district count variations
if (exists("district_counts") && nrow(district_counts) > 1) {
dc_range <- range(district_counts$n_districts)
if (dc_range[2] - dc_range[1] > 50) {
issues <- c(issues, paste("District count varies significantly:", dc_range[1], "to", dc_range[2]))
}
}
if (length(issues) > 0) {
cat("Data Quality Issues Found:\n")
for (i in seq_along(issues)) {
cat(paste0(i, ". ", issues[[i]], "\n"))
}
} else {
cat("No major data quality issues detected.\n")
}Session Info
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] paschooldata_0.1.0 testthat_3.3.2
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.39 desc_1.4.3 R6_2.6.1 fastmap_1.2.0
## [5] xfun_0.56 magrittr_2.0.4 cachem_1.1.0 knitr_1.51
## [9] htmltools_0.5.9 rmarkdown_2.30 lifecycle_1.0.5 cli_3.6.5
## [13] sass_0.4.10 pkgdown_2.2.0 textshaping_1.0.5 jquerylib_0.1.4
## [17] systemfonts_1.3.2 compiler_4.5.2 tools_4.5.2 ragg_1.5.1
## [21] brio_1.1.5 evaluate_1.0.5 bslib_0.10.0 yaml_2.3.12
## [25] jsonlite_2.0.0 rlang_1.1.7 fs_1.6.7