Skip to contents

Overview

This vignette performs data quality analysis on Pennsylvania enrollment data from the Pennsylvania Department of Education (PDE). We examine:

  1. Statewide enrollment time series (looking for anomalies)
  2. Year-over-year changes (flagging jumps > 5%)
  3. Major district enrollment trends
  4. Data quality issues discovered

Note: This vignette requires network access to fetch data from PDE. Run the code interactively to execute the analysis.

Data Collection

Fetch enrollment data for available years. PDE provides data from 2005-present, though format consistency varies.

# Attempt to fetch all available years
# Start with more recent years that have more consistent formatting
years_to_fetch <- 2015:2024

# Fetch data with error handling
all_enr <- tryCatch({
  fetch_enr_years(years_to_fetch)
}, error = function(e) {
  message("Error fetching some years: ", e$message)
  NULL
})

if (is.null(all_enr) || nrow(all_enr) == 0) {
  stop("Could not fetch any enrollment data. Check network connection and PDE website availability.")
}

# Report what years we got
years_fetched <- unique(all_enr$end_year)
message("Successfully fetched data for years: ", paste(years_fetched, collapse = ", "))

Statewide Time Series Analysis

Total Enrollment Trend

state_totals <- all_enr %>%
  filter(
    is_state | type == "Statewide",
    subgroup == "total_enrollment",
    grade_level == "TOTAL"
  ) %>%
  select(end_year, n_students) %>%
  arrange(end_year) %>%
  distinct()

# If no statewide rows exist, create from district sums
if (nrow(state_totals) == 0) {
  state_totals <- all_enr %>%
    filter(
      is_district,
      subgroup == "total_enrollment",
      grade_level == "TOTAL"
    ) %>%
    group_by(end_year) %>%
    summarize(n_students = sum(n_students, na.rm = TRUE), .groups = "drop") %>%
    arrange(end_year)
}

state_totals

Year-over-Year Changes

Flag any year with enrollment change > 5% as potentially anomalous.

state_yoy <- state_totals %>%
  arrange(end_year) %>%
  mutate(
    prev_year_enr = lag(n_students),
    change = n_students - prev_year_enr,
    pct_change = (n_students - prev_year_enr) / prev_year_enr * 100,
    flag_large_change = abs(pct_change) > 5
  )

# Display changes
state_yoy %>%
  select(end_year, n_students, change, pct_change, flag_large_change) %>%
  mutate(
    pct_change = round(pct_change, 2),
    change = format(change, big.mark = ",")
  )

Visualization

ggplot(state_totals, aes(x = end_year, y = n_students)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "steelblue", size = 2) +
  scale_y_continuous(labels = comma, limits = c(0, NA)) +
  scale_x_continuous(breaks = seq(min(state_totals$end_year), max(state_totals$end_year), by = 1)) +
  labs(
    title = "Pennsylvania Public School Enrollment",
    subtitle = paste("School Years", min(state_totals$end_year) - 1, "-", min(state_totals$end_year),
                     "through", max(state_totals$end_year) - 1, "-", max(state_totals$end_year)),
    x = "End Year (e.g., 2024 = 2023-24 school year)",
    y = "Total Enrollment"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Large Changes Analysis

large_changes <- state_yoy %>%
  filter(flag_large_change) %>%
  select(end_year, n_students, pct_change)

if (nrow(large_changes) > 0) {
  cat("Years with >5% enrollment change:\n")
  print(large_changes)
} else {
  cat("No years with enrollment changes exceeding 5%.\n")
}

Major District Analysis

Analyze enrollment trends for Pennsylvania’s largest urban districts.

Major District AUNs

Pennsylvania’s largest districts by enrollment:

District AUN
Philadelphia City SD 126515001
Pittsburgh SD 102027451
Allentown City SD 121390302
Reading SD 114067503
Erie City SD 105252602
# Define major district AUNs
major_districts <- tibble::tibble(
  aun = c("126515001", "102027451", "121390302", "114067503", "105252602"),
  district_name = c("Philadelphia", "Pittsburgh", "Allentown", "Reading", "Erie")
)

# Get district-level enrollment
district_enr <- all_enr %>%
  filter(
    is_district,
    subgroup == "total_enrollment",
    grade_level == "TOTAL"
  ) %>%
  select(end_year, aun, lea_name, n_students)

# Filter to major districts
major_district_enr <- district_enr %>%
  inner_join(major_districts, by = "aun") %>%
  select(end_year, district_name, lea_name, n_students) %>%
  arrange(district_name, end_year)

major_district_enr

Major District Year-over-Year Changes

major_yoy <- major_district_enr %>%
  group_by(district_name) %>%
  arrange(end_year) %>%
  mutate(
    prev_year_enr = lag(n_students),
    pct_change = (n_students - prev_year_enr) / prev_year_enr * 100,
    flag = abs(pct_change) > 5
  ) %>%
  ungroup() %>%
  filter(!is.na(pct_change))

# Show any large changes
major_flags <- major_yoy %>%
  filter(flag) %>%
  select(end_year, district_name, n_students, pct_change) %>%
  mutate(pct_change = round(pct_change, 2))

if (nrow(major_flags) > 0) {
  cat("Major districts with >5% year-over-year change:\n")
  print(major_flags)
} else {
  cat("No major districts had enrollment changes exceeding 5%.\n")
}

Major District Trend Visualization

if (nrow(major_district_enr) > 0) {
  ggplot(major_district_enr, aes(x = end_year, y = n_students, color = district_name)) +
    geom_line(linewidth = 1) +
    geom_point(size = 2) +
    scale_y_continuous(labels = comma) +
    scale_x_continuous(breaks = seq(min(major_district_enr$end_year),
                                     max(major_district_enr$end_year), by = 1)) +
    labs(
      title = "Enrollment Trends: Major Pennsylvania Districts",
      x = "End Year",
      y = "Total Enrollment",
      color = "District"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

Demographic Distribution Check

Verify that demographic breakdowns sum to approximately the total.

demo_check <- all_enr %>%
  filter(
    is_state | type == "Statewide",
    grade_level == "TOTAL",
    subgroup %in% c("total_enrollment", "white", "black", "hispanic",
                    "asian", "native_american", "pacific_islander", "multiracial")
  ) %>%
  select(end_year, subgroup, n_students) %>%
  tidyr::pivot_wider(names_from = subgroup, values_from = n_students) %>%
  mutate(
    demo_sum = rowSums(select(., -end_year, -total_enrollment), na.rm = TRUE),
    diff = total_enrollment - demo_sum,
    pct_diff = diff / total_enrollment * 100
  )

demo_check %>%
  select(end_year, total_enrollment, demo_sum, diff, pct_diff) %>%
  mutate(pct_diff = round(pct_diff, 2))

Grade Distribution Check

Verify grade-level enrollments sum to total.

grade_check <- all_enr %>%
  filter(
    is_state | type == "Statewide",
    subgroup == "total_enrollment"
  ) %>%
  select(end_year, grade_level, n_students) %>%
  group_by(end_year) %>%
  summarize(
    total = sum(n_students[grade_level == "TOTAL"], na.rm = TRUE),
    grade_sum = sum(n_students[grade_level != "TOTAL"], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    diff = total - grade_sum,
    pct_diff = diff / total * 100
  )

grade_check %>%
  mutate(pct_diff = round(pct_diff, 2))

Missing Data Analysis

# Check for missing values in key columns
missing_summary <- all_enr %>%
  filter(grade_level == "TOTAL", subgroup == "total_enrollment") %>%
  summarize(
    total_rows = n(),
    missing_aun = sum(is.na(aun) | aun == ""),
    missing_n_students = sum(is.na(n_students)),
    missing_lea_name = sum(is.na(lea_name) | lea_name == ""),
    missing_county = sum(is.na(county) | county == "")
  )

missing_summary

District Count by Year

Check that the number of districts remains relatively stable.

district_counts <- all_enr %>%
  filter(
    is_district,
    subgroup == "total_enrollment",
    grade_level == "TOTAL"
  ) %>%
  group_by(end_year) %>%
  summarize(
    n_districts = n_distinct(aun),
    .groups = "drop"
  ) %>%
  arrange(end_year)

district_counts
if (nrow(district_counts) > 1) {
  ggplot(district_counts, aes(x = end_year, y = n_districts)) +
    geom_col(fill = "steelblue") +
    scale_y_continuous(limits = c(0, NA)) +
    labs(
      title = "Number of Pennsylvania School Districts by Year",
      x = "End Year",
      y = "Number of Districts"
    ) +
    theme_minimal()
}

Data Quality Issues Summary

issues <- list()

# Check for large year-over-year changes
if (exists("state_yoy") && any(state_yoy$flag_large_change, na.rm = TRUE)) {
  flagged_years <- state_yoy$end_year[state_yoy$flag_large_change & !is.na(state_yoy$flag_large_change)]
  issues <- c(issues, paste("Large statewide enrollment changes in:", paste(flagged_years, collapse = ", ")))
}

# Check for demographic sum mismatches
if (exists("demo_check") && any(abs(demo_check$pct_diff) > 1, na.rm = TRUE)) {
  bad_years <- demo_check$end_year[abs(demo_check$pct_diff) > 1 & !is.na(demo_check$pct_diff)]
  issues <- c(issues, paste("Demographic sums don't match total in:", paste(bad_years, collapse = ", ")))
}

# Check for missing data
if (exists("missing_summary") && missing_summary$missing_n_students > 0) {
  issues <- c(issues, paste("Missing enrollment counts:", missing_summary$missing_n_students, "records"))
}

# Check for district count variations
if (exists("district_counts") && nrow(district_counts) > 1) {
  dc_range <- range(district_counts$n_districts)
  if (dc_range[2] - dc_range[1] > 50) {
    issues <- c(issues, paste("District count varies significantly:", dc_range[1], "to", dc_range[2]))
  }
}

if (length(issues) > 0) {
  cat("Data Quality Issues Found:\n")
  for (i in seq_along(issues)) {
    cat(paste0(i, ". ", issues[[i]], "\n"))
  }
} else {
  cat("No major data quality issues detected.\n")
}

Session Info

## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] paschooldata_0.1.0 testthat_3.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.39     desc_1.4.3        R6_2.6.1          fastmap_1.2.0    
##  [5] xfun_0.56         magrittr_2.0.4    cachem_1.1.0      knitr_1.51       
##  [9] htmltools_0.5.9   rmarkdown_2.30    lifecycle_1.0.5   cli_3.6.5        
## [13] sass_0.4.10       pkgdown_2.2.0     textshaping_1.0.5 jquerylib_0.1.4  
## [17] systemfonts_1.3.2 compiler_4.5.2    tools_4.5.2       ragg_1.5.1       
## [21] brio_1.1.5        evaluate_1.0.5    bslib_0.10.0      yaml_2.3.12      
## [25] jsonlite_2.0.0    rlang_1.1.7       fs_1.6.7