5 Geographic Data Preparation
The goal of this chapter is to line up survey data, Census controls, and study geographies so every downstream step (initial expansion, PopSim, rounds of weighting) has consistent keys, counts, and shapes.
- Load the project package and settings.
- Validate the presence and shape of source tables.
- Build a PUMA ↔ client-zone crosswalk that allocates block-group counts proportionally.
- Save artifacts for the sample plan and make maps.
5.1 Load Packages, Settings, and Directory Paths
We start by loading the local package for development, reading the settings file, and extracting commonly used paths and labels from the settings.

# Load the Local Package for Development
if (requireNamespace("devtools", quietly = TRUE)) {
  devtools::load_all()
}

settings = get_settings(reload_settings = TRUE, print = FALSE)

# Extract Commonly Used Paths and Labels from Settings
code_root <- get("code_root", settings)
work_dir <- get("working_dir", settings)
inputs_dir <- get("inputs_dir", settings)
outputs_dir <- get("outputs_dir", settings)
report_dir <- get("report_dir", settings)
popsim_initial_label <- get("popsim_initial_label", settings)
popsim_daypat_label <- get("popsim_daypat_label", settings)
Validate Datasets
Validation catches missing tables and relationship errors before they propagate into the weights. We use RSG’s hts.iochecker (built on pointblank) to verify both schema (columns and types) and relationships (e.g., trips must belong to valid people/days).
It’s faster to run validation than to debug mismatches later in expansion or PopulationSim.
This first command loads the expected schema for HTS datasets.
# Default RSG Schema Coded into the IO Checker Package
schemas_list = hts.iochecker::load_rsg_schema(schema_set = 'default')

# Load Tables for Validation According to the Schema
tables_list = list()
table_names = intersect(names(schemas_list), names(get("hts_table_map", settings)))
for (table_name in table_names) {
  tables_list[[table_name]] = fetch_hts_table(table_name, settings)
}

Next, we validate the relationships between tables (e.g., every person belongs to only one household). The allowed_childless argument permits missing records for some relations, e.g., a person may have 0 days, or a day may have 0 trips.
hts.iochecker::validate_table_relation(
  tables_list,
  allowed_childless = c("day", "trip", "linked_trip", "tour")
)

Finally, we validate each table against its expected schema (columns, types, labels).
# Results are assigned to suppress printing "TRUE" when validation passes; set print to TRUE to show the output
print = FALSE
result <- hts.iochecker::validate_all_tables(tables_list, schemas_list)
if (print == TRUE) {
  print(result)
}

Typical issues caught at this stage include:

- Missing value_labels or mismatched codes (e.g., mode_type).
- Mis-named variables (e.g., res_type instead of residence_type).
- Households outside the sample_plan geographic frame.
- Non-unique keys (hh_ids, trip_ids), especially when data are transferred without respecting special integer types like int64.
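When a check fails, it can help to inspect the offending rows directly. The snippet below is a quick ad hoc sketch, not part of the hts.iochecker API; it assumes a tables_list built as above with an hh table keyed by hh_id and a trip table with a mode_type column, so adjust the names to your project.

# Ad hoc debugging sketch; table and column names (hh, trip, hh_id, mode_type) are assumptions
library(data.table)

# Non-unique household keys (e.g., int64 IDs corrupted during a transfer)
dup_hh = tables_list$hh[, .N, by = hh_id][N > 1]
if (nrow(dup_hh) > 0) warning(nrow(dup_hh), " hh_id values are duplicated")

# mode_type codes in the trip table with no entry in value_labels
labeled_codes = tables_list$value_labels[variable == "mode_type", value]
unlabeled = setdiff(unique(tables_list$trip$mode_type), labeled_codes)
if (length(unlabeled) > 0) {
  warning("mode_type codes missing from value_labels: ", paste(unlabeled, collapse = ", "))
}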
An additional step checks that “Monday” is coded as 1 in the travel_dow variable. This is important because the day-pattern model and PopulationSim configuration assume this coding.
if (
  tables_list$value_labels[
    variable == "travel_dow" &
      stringr::str_detect(label, "Monday") &
      value == 1,
    .N
  ] == 0
) {
  warning(
    "Monday is not coded as 1 in the value_labels table. ",
    "Check the `weight_dow_group` configuration in the settings file."
  )
}

Create Crosswalks
PUMA to Weighting Zone Groups
Prepare PUMA Spatial Data for Study Region and Zones
The weighting zone is the geographic unit used as the basis for weighting and expansion. Currently, this is PUMAs (Public Use Microdata Areas).
A weighting zone group combines multiple weighting zones to ensure reliable sample sizes for weighting. In the code, this is sometimes called a client zone (because it is often defined by the client).
This step establishes the link between Public Use Microdata Areas (PUMAs)—the smallest geography available in the Census microdata—and the zones used for weighting and expansion. Some clients define their own “custom zones” based on MPOs, neighborhoods, or cities.
The Puget Sound Regional Council (PSRC) weighting process keeps things simpler: zones are defined as groups of PUMAs that together approximate the region’s subareas used for modeling and reporting.
When weighting zones are directly built from PUMAs, no separate crosswalk is needed between client-defined geographies (e.g., cities, neighborhoods, MPOs) and PUMAs. Instead, the script identifies all PUMAs that intersect the study region boundary and treats those as the base units for target tabulation and sample expansion. The geometry preparation still matters, however—it ensures that PUMAs are valid, clipped to the study region, and aligned to a consistent coordinate reference system. This guarantees that when Census control totals are aggregated or survey records are assigned to zones, all spatial joins occur cleanly and without mismatched boundaries.
In other words, when zone_type = "pumas" in the configuration, every weighting zone corresponds either to a single PUMA or to a defined group of PUMAs combined later in the control data setup. This “crosswalk” step therefore functions mainly as a validation and alignment stage—confirming that the correct set of PUMAs is being used for the study region and that their identifiers match across all input datasets. This keeps the workflow flexible for future surveys while avoiding unnecessary complexity in the zone definitions.
# Get PUMA `sf` Data for Study
puma_sf = get_puma_geom(settings) %>%
  # Concat PUMA from state and PUMA
  dplyr::mutate(PUMA = paste0(
    get(stringr::str_subset(names(.), "STATEFP")),
    get(stringr::str_subset(names(.), "PUMACE")))
  )

# Client zones: Use Client-Specific Zones if Provided, Otherwise Default to Study Region/PUMAs
study_region = fetch_study_region(settings) %>%
  sf::st_transform(sf::st_crs(puma_sf)) %>%
  sf::st_make_valid()

if (get("zone_type", settings) == "client_zones") {
  client_zones_sf = fetch_hts_table("client_zones", settings)
  if (is.null(client_zones_sf)) {
    stop("Client zones not found! Must be created before running this script.")
  }
  # Get expected client zone IDs
  expected_client_zone_ids = c("-1", unique(client_zones_sf$client_zone_id))
} else if (get("zone_type", settings) == "pumas") {
  # Default to the study region
  client_zones_sf = study_region %>%
    dplyr::mutate(client_zone_id = as.character(NA), client_zone = "study_region")
  # Get PUMAs intersecting the study region
  pumas = puma_sf[sf::st_intersects(study_region, puma_sf, sparse = FALSE), "PUMA"]
  # Client zone IDs
  expected_client_zone_ids = c('-1', as.vector(pumas$PUMA))
} else {
  stop("Invalid zone_type. Must be 'client_zones' or 'pumas'.")
}

# Ensure Client Zones are Valid Spatial Data
client_zones_sf = client_zones_sf %>%
  sf::st_as_sf() %>%
  sf::st_make_valid() %>%
  sf::st_transform(sf::st_crs(puma_sf)) %>%
  sf::st_make_valid()

Prepare Block Group Data
This step compiles American Community Survey (ACS) population and household counts at the block group level—the smallest geography with comprehensive demographic information. These data provide the population “weights” that underpin proportional reallocation between spatial units later in the weighting process. The script retrieves ACS estimates and geometries for all block groups in the study state, checks for missing values, and reshapes them into a reference table (bg_dt) keyed by block group GEOID.
Because block groups do not nest cleanly within PUMAs, the script builds a practical crosswalk that links each block group to a PUMA using tract-level relationships. This produces a one-to-many mapping in edge cases, where a single tract (and its block groups) overlaps multiple PUMAs. In those situations, population counts can later be apportioned proportionally when constructing the PUMA-to-zone crosswalk. The resulting dataset (bg_puma_sf) carries both geometry and identifiers, aligning ACS reference populations with the PUMA geography used for weighting.
PSRC uses groups of PUMAs as weighting zones, so this step is primarily about alignment and completeness—ensuring all block groups contributing to the study region are correctly tied to their parent PUMA(s), and that any overlapping areas are accounted for before aggregating control totals. This approach preserves the full population base while respecting the PUMA definitions that form the backbone of the weighting framework.
# Fetch Block Group Data and ACS Estimates for Population Counts
bg_ref = fetch_acs(
  variables = settings$acs_count_vars,
  settings = settings
)

# Download ACS Block Group Geometries for the State
state_fips = get_state_fips(settings)
acs_bg_sf = tidycensus::get_acs(
  geography = "block group",
  variables = settings$acs_count_vars,
  year = settings$acs_year,
  state = state_fips,
  survey = settings$acs_dataset,
  geometry = TRUE
)

# Extract Unique Block Groups
bg_sf = acs_bg_sf %>% dplyr::select(GEOID) %>% dplyr::distinct()

# Convert the ACS Estimates to a `data.table` for Reshaping
acs_bg_dt = acs_bg_sf %>% as.data.table()

# Check for Missing Values in ACS Data
if (any(is.na(acs_bg_dt[, .(variable, estimate)]))) stop("There are NAs in the ACS data")

# Reshape ACS Data to Wide Format for Reference Counts
bg_dt = dcast(
  acs_bg_dt[, .(count = sum(estimate)), by = .(GEOID, variable)],
  GEOID ~ variable,
  value.var = "count"
)
setnames(bg_dt, c("person", "household"), c("ref_count_per", "ref_count_hh"), skip_absent = TRUE)

Block Group to PUMA Crosswalk
# Map Block Groups to PUMAs Using Tracts
tract_puma = get_tracts_puma_xwalk(
  settings,
  puma_year = get("pums_year", settings),
  tract_year = get("acs_year", settings)
)[, .(tract_id, PUMA)]

bg_dt[, tract_id := stringr::str_sub(GEOID, 1, 11)]

# Filter to Block Groups Within Study Region's PUMAs
bg_puma_dt = tract_puma[bg_dt, on = "tract_id"][!is.na(PUMA)]

# Join the Geometry to the BG-PUMA Crosswalk
bg_puma_sf = bg_sf %>%
  dplyr::filter(GEOID %in% bg_puma_dt$GEOID) %>%
  dplyr::left_join(bg_puma_dt, by = c("GEOID")) %>%
  sf::st_make_valid()

# Confirm All Reference BGs are Present in the Crosswalk
stopifnot(all(bg_ref$GEOID %in% bg_puma_sf$GEOID))
stopifnot(!any(is.na(bg_puma_sf)))

Block Group to PUMA to Zone Group Crosswalk
This step reconciles three different geographic layers that do not nest cleanly within one another: Census block groups, Public Use Microdata Areas (PUMAs), and (if applicable) client-defined weighting zones. Because PUMAs are drawn independently of tract or block group boundaries, a single block group can span multiple PUMAs or zones. To use Census population and household estimates as control totals for weighting, we must therefore define how each block group’s data should be shared among the intersecting areas.
The script accomplishes this by performing spatial intersections between block groups, PUMAs, and zones to produce a three-way crosswalk. Each record in this crosswalk represents a fragment of overlap among the three geographies. The area of the overlap is used as a proportional weighting factor, under the simplifying assumption that population is uniformly distributed within a block group. This allows ACS counts (population, households, etc.) to be re-expressed in the geography of PUMAs or client zones, even when the boundaries misalign.
Two approaches are supported:
- If, in settings, largest_bg_to_client_zone = TRUE, each block group is assigned to the single zone where it overlaps most, simplifying later joins but ignoring smaller overlaps. This should only be set to TRUE if block groups nest perfectly within zones.
- Otherwise, the script performs full area-weighted splitting, retaining all intersections and proportionally allocating reference counts by overlap area. Tiny overlaps below a user-defined xwalk_sliver_threshold (specified in settings) are removed, and proportions are renormalized so each block group’s shares sum to one.
The resulting dataset (bg_puma_czone_dt) serves as the spatial key that allows ACS-based controls to be aggregated to the zones used for weighting and enables survey records to be validated against the same geography.
For PSRC, which does not use custom zones, this crosswalk primarily ensures that ACS population totals are properly reconciled to the PUMA groupings defining PSRC’s weighting zones (groups of PUMAs). Because block groups and PUMAs overlap irregularly in the Puget Sound region, this proportional allocation step is essential—it guarantees that regional control totals remain internally consistent, even when geographic boundaries do not align exactly.
This step can be computationally intensive, especially for large regions with many overlapping geographies.
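To make the area-weighted splitting concrete, here is a toy example (illustrative numbers only, not project data): a block group with 300 households overlaps zone A on 70% of its area, zone B on 29.5%, and zone C on a 0.5% sliver.

# Toy illustration of area-proportional allocation with sliver removal
library(data.table)

toy = data.table(
  GEOID        = "530330001001",          # hypothetical block group
  client_zone  = c("A", "B", "C"),
  area         = c(0.700, 0.295, 0.005),  # share of the BG's area in each overlap
  ref_count_hh = 300                      # the BG's household count, repeated per overlap
)

# Proportional shares, as in the main script
toy[, area_prop := area / sum(area), by = GEOID]

# Zero out slivers below the threshold, then renormalize so shares still sum to one
sliver_threshold = 0.01
toy[area_prop < sliver_threshold, area_prop := 0]
toy[, area_prop := area_prop / sum(area_prop), by = GEOID]

# Allocate the reference count: about 211 households to A, 89 to B, and 0 to C
toy[, ref_count_hh := ref_count_hh * area_prop]
toy[]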
ref_count_cols = stringr::str_subset(names(bg_dt), "ref_count")
sliver_threshold = get("xwalk_sliver_threshold", settings)
xwalk_path <- file.path(get("working_dir", settings), "bg_puma_czone_xwalk.rds")

if (file.exists(xwalk_path)) {
  bg_puma_czone_dt <- readRDS(xwalk_path)
} else {
  # Set known constants in geometry object | This silences the warning:
  # Warning message:
  # attribute variables are assumed to be spatially constant throughout all geometries
  client_zones_sf = client_zones_sf %>% st_set_agr("constant")
  bg_puma_sf = bg_puma_sf %>% st_set_agr("constant")
  study_region = study_region %>% st_set_agr("constant")
  puma_sf = puma_sf %>% st_set_agr("constant")

  # Create a three-way crosswalk (BGs, PUMAs, Client Zones) using spatial intersections
  # Area proportionality is used for population allocation at the block group scale
  # - Option to use the largest overlap with BG in the client zone or split overlapping BGs
  if (get("largest_bg_to_client_zone", settings)) {
    message("Using largest overlap of BGs with client zones")
    # Suppressing warnings about assuming spatially constant attributes.
    # This is because we are doing a many-to-many join. The warning is not easily avoidable.
    suppressWarnings(
      {
        bg_puma_czone = sf::st_join(
          bg_puma_sf,
          client_zones_sf,
          join = sf::st_intersects,
          largest = TRUE
        ) %>%
          sf::st_make_valid() %>%
          dplyr::mutate(area = sf::st_area(geometry))
      }
    )
  } else {
    # Intersect BG-PUMA with client zones for inside-region mapping
    bg_puma_czone_int = sf::st_intersection(client_zones_sf, bg_puma_sf) %>%
      sf::st_make_valid() %>%
      dplyr::mutate(area = sf::st_area(geometry))

    # Identify BGs outside client zones
    bg_puma_czone_out = sf::st_difference(bg_puma_sf, study_region) %>%
      sf::st_make_valid() %>%
      dplyr::mutate(
        area = sf::st_area(geometry),
        client_zone = "outside_region",
        client_zone_id = -1
      ) %>%
      dplyr::select(names(bg_puma_czone_int))

    bg_puma_czone = rbind(bg_puma_czone_int, bg_puma_czone_out)
  }

  # Calculate population allocation proportional to intersected area; use data.table for speed
  bg_puma_czone_dt = copy(bg_puma_czone) %>% setDT()
  bg_puma_czone_dt[, bg_area := sum(area), by = GEOID]
  bg_puma_czone_dt[, area_prop := as.numeric(area / bg_area), by = GEOID]
  bg_puma_czone_dt[is.na(area_prop), area_prop := 0]

  # Drop empty areas
  bg_puma_czone_dt = bg_puma_czone_dt[area_prop > 0]

  # Check that area_prop preserves total area
  stopifnot(
    all.equal(
      bg_puma_czone_dt[, sum(area_prop)],
      bg_puma_czone_dt[, uniqueN(GEOID)]
    )
  )

  # Remove spatial slivers (tiny overlaps) below threshold
  # Set slivers to 0 and recalculate the area proportionality
  if (!"xwalk_sliver_threshold" %in% names(settings)) {
    stop("xwalk_sliver_threshold not found in settings. Please set this value.")
  }
  bg_puma_czone_dt[area_prop < sliver_threshold & area_prop > 0, area_prop := 0]
  bg_puma_czone_dt[, area_prop := area_prop / sum(area_prop), by = GEOID]

  # Check that area_prop preserves total area
  stopifnot(
    all.equal(
      bg_puma_czone_dt[, sum(area_prop)],
      bg_puma_czone_dt[, uniqueN(GEOID)]
    )
  )

  # Scale reference counts by area proportion
  bg_puma_czone_dt[,
    (ref_count_cols) := lapply(.SD, function(x) x * area_prop),
    .SDcols = ref_count_cols
  ]

  # If client zones are not provided, PUMAs become client zones
  # but change name to differentiate trimmed areas
  if (!all(get("zone_type", settings) == "client_zones")) {
    # For client_zones in study region, rename client zones
    bg_puma_czone_dt[
      client_zone == "study_region",
      client_zone_id := as.character(PUMA)
    ]
  }

  # Save this full outer crosswalk to RDS
  saveRDS(
    bg_puma_czone_dt,
    file.path(get("working_dir", settings), "bg_puma_czone_xwalk.rds")
  )
}

Finalize the PUMA to Zone Crosswalk
This step turns the proportional shares you just computed into a spatially explicit crosswalk between PUMAs and weighting zones (or PUMA groups). First, it builds geometry for every PUMA–zone pair by intersecting puma_sf with client_zones_sf and tagging any remainder outside the study boundary as outside_region. It then joins the per-PUMA proportions (prop_*) from puma_czone_dt onto these geometries, yielding a single sf object (puma_czone_sf) that carries:
- The intersection polygon,
- The zone and PUMA IDs, and
- The household/person allocation shares.
If the project uses zone_type = "pumas" (no custom zones), the code relabels the in-region intersections so each PUMA appears as its own zone.
Robustness checks follow. The script verifies that proportions sum to 1 within each PUMA (both household and person), that all PUMAs and all expected zone IDs are present, and that any outside-region portion is reported (with percent of each PUMA clipped). This makes boundary effects explicit and prevents double counting when controls are later rolled to zones. The result is saved as puma_czone_xwalk.rds, which you’ll reuse downstream to:
- Visualize PUMA–zone coverage, and
- Apply consistent allocation factors whenever translating ACS/PUMS or survey tabulations between PUMA geography and the study’s weighting zones.
Even when zones are groups of PUMAs, this step is still crucial: it cleanly identifies any PUMAs that straddle the study boundary and encodes the exact in-region share to use when aggregating controls to PSRC’s zone groups, ensuring totals are internally consistent and cartographically accurate.
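One note on the code that follows: it references puma_czone_dt and prop_cols, which come from an aggregation step not shown on this page. A minimal sketch of that step, assuming it simply rolls the block-group crosswalk up to PUMA–zone pairs and converts the scaled counts to shares, would be:

# Sketch only: aggregate the BG crosswalk to PUMA x zone and compute each PUMA's
# household/person share per zone. Column names follow their use in the code below.
puma_czone_dt = bg_puma_czone_dt[,
  .(ref_count_hh = sum(ref_count_hh), ref_count_per = sum(ref_count_per)),
  by = .(PUMA, client_zone, client_zone_id)
]
puma_czone_dt[, prop_hh := ref_count_hh / sum(ref_count_hh), by = PUMA]
puma_czone_dt[, prop_per := ref_count_per / sum(ref_count_per), by = PUMA]
prop_cols = c("prop_hh", "prop_per")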
# Create Final Spatial Crosswalk (for Mapping and Allocation)
# Find the Intersection of PUMAs and Client Zones
puma_czone_sf_int <- sf::st_intersection(client_zones_sf, puma_sf) %>%
  dplyr::select(PUMA, client_zone) %>%
  sf::st_make_valid()

# Find Any PUMAs That Go Outside the Client Zones
puma_czone_sf_out <- sf::st_difference(puma_sf, study_region) %>%
  dplyr::select(PUMA) %>%
  dplyr::mutate(client_zone = "outside_region") %>%
  sf::st_make_valid()

# Join the Population Proportions to the PUMA-Zone Crosswalk
puma_czone_sf = puma_czone_dt %>%
  dplyr::left_join(
    rbind(puma_czone_sf_int, puma_czone_sf_out),
    by = c("PUMA", "client_zone")
  ) %>%
  sf::st_as_sf() %>%
  dplyr::rename(puma_id = PUMA) %>%
  sf::st_make_valid()

# If Client Zones are not Provided, Rename PUMAs as Client Zones, but Change Name to Differentiate Trimmed Areas
if (!all(get("zone_type", settings) == "client_zones")) {
  # For client_zones in study region, rename client zones
  puma_czone_sf = puma_czone_sf %>%
    dplyr::mutate(
      client_zone_id = dplyr::if_else(
        client_zone == "study_region",
        # as.character(seq_along(puma_id)),
        as.character(puma_id),
        client_zone_id
      ),
      client_zone = dplyr::if_else(
        client_zone == "study_region",
        paste0(puma_id, "_", round(100 * prop_hh), "%"),
        client_zone
      )
    )
}

# Check That PUMA Proportions Sum to 1 Before Dropping Out-of-Region Parts
checksum = as.data.table(puma_czone_sf)[, lapply(.SD, sum), .SDcols = prop_cols, by = puma_id]
stopifnot(checksum[, all(all(abs(prop_hh - 1) < 0.0001), all(abs(prop_per - 1) < 0.0001))])

# Get the Portion of Each PUMA Outside the Region
outside_prop = as.data.table(puma_czone_sf)[
  client_zone == "outside_region" & (prop_hh > 0 | prop_per > 0),
  lapply(.SD, sum), .SDcols = c("prop_hh", "prop_per"),
  by = puma_id
]

# Report the Portion of Each PUMA Outside the Region
if (nrow(outside_prop) > 0) {
  # Build the message listing each affected PUMA
  msg = paste0(
    "PUMAs extend outside the region.\n",
    "Note that this portion of the PUMAs will be trimmed off target counts.\n",
    "If this is not expected, check the sample plan for troublesome ",
    "block groups or adjust the sliver threshold."
  )
  for (i in seq_len(nrow(outside_prop))) {
    msg = paste0(
      msg,
      "\nPUMA: ", outside_prop$puma_id[i],
      " | HH: ", round(outside_prop$prop_hh[i] * 100, 2),
      "% | Per: ", round(outside_prop$prop_per[i] * 100, 2), "%"
    )
  }
  message(msg)
}

# Final Crosswalk Object
puma_zone_xwalk_sf = puma_czone_sf

# Confirm All PUMAs and Client Zones are Present in the Final Crosswalk
stopifnot(all(puma_sf$PUMA %in% puma_zone_xwalk_sf$puma_id))

# Check That the Crosswalk Contains No Extra PUMAs
stopifnot(all(puma_zone_xwalk_sf$puma_id %in% puma_sf$PUMA))

# Check That All Client Zones Are in the Crosswalk -- Include -1 for Outside Region
stopifnot(all(unique(puma_zone_xwalk_sf$client_zone_id) %in% expected_client_zone_ids))

# Check That the Proportions Sum to 1 for Each PUMA
checksum = as.data.table(puma_zone_xwalk_sf)[, lapply(.SD, sum), .SDcols = prop_cols, by = puma_id]
stopifnot(
  all(checksum[, .(
    prop_hh = all(abs(prop_hh - 1) < 0.00001),
    prop_per = all(abs(prop_per - 1) < 0.00001)
  )])
)

# Save to RDS
f_name = file.path(get("working_dir", settings), "puma_czone_xwalk.rds")
saveRDS(puma_zone_xwalk_sf, f_name)

Map: Intersection of PUMAs and Client Zones
In the below map, each polygon shows where a PUMA overlaps a weighting zone, colored by zone label. Dark purple areas mark PUMA portions outside the study region. Hover to view PUMA ID and the share of its households and persons allocated to that zone.
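The map itself is produced by code not shown on this page. A minimal interactive sketch, assuming the crosswalk saved above and the leaflet package, might look like the following:

# Illustrative sketch of the PUMA-zone coverage map (not the report's exact code)
library(leaflet)

xwalk = readRDS(file.path(get("working_dir", settings), "puma_czone_xwalk.rds")) %>%
  sf::st_transform(4326)  # leaflet expects longitude/latitude

pal = colorFactor("viridis", xwalk$client_zone)

leaflet(xwalk) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(
    fillColor = ~pal(client_zone), fillOpacity = 0.6,
    color = "white", weight = 1,
    # Hover label: PUMA ID plus the household and person shares allocated to that zone
    label = ~paste0(
      "PUMA ", puma_id, ": ",
      round(100 * prop_hh, 1), "% HH, ",
      round(100 * prop_per, 1), "% persons"
    )
  )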
5.2 Add Reference Counts to Sample Plan
This step enriches the sample plan with ACS-based reference counts and geography keys so later scripts can validate coverage and compute targets without re-joining large tables. We join the BG–PUMA–zone crosswalk to sample_plan by block group GEOID, attach ref_count_hh and ref_count_per, and carry over the PUMA id used throughout the weighting workflow. We also standardize the segment_name field (fixing misspellings and collapsing very small segments into broader categories). Consolidating sparse segments up front prevents unstable rates and makes downstream tabulations—like seed creation and expansion diagnostics—more reliable. The updated plan is written to inputs/pops/sample_plan.rds for reuse.
Why this matters: putting reference counts and consistent segment labels directly on the sample plan gives you a single, authoritative table for sampling decisions, quota checks, and diagnostics. It reduces join complexity later and ensures that every BG slated for sampling carries the same counts you’ll use for controls.
# Save Reference Counts to Sample Plan for Use in Later Scripts
sample_plan = fetch_hts_table("sample_plan", settings)
sample_plan[bg_puma_czone_dt, `:=`(
  ref_count_hh = i.ref_count_hh,
  ref_count_per = i.ref_count_per,
  PUMA = i.PUMA
), on = c("bg_geoid" = "GEOID")]

setnames(sample_plan, old = "segment", new = "segment_name", skip_absent = TRUE)

## Fix Misspellings
sample_plan[, segment_name := stringr::str_replace(segment_name, "Unicorporated", "Unincorporated")]
sample_plan[, segment_name := stringr::str_replace(segment_name, "Pierce:Penrose", "Pierce County, Incorporated")]

## Upcode Segments with Fewer Than 10 Households
sample_plan[
  segment_name %in% c("Pierce County, Incorporated-Rural", "Pierce County, Unincorporated-Rural"),
  segment_name := "Pierce County, Rural (All)"
]

# Write the Updated Sample Plan to the PopulationSim Inputs Directory
pops_input_dir <- file.path(get("inputs_dir", settings), "pops")
sample_path <- file.path(pops_input_dir, "sample_plan.rds")
saveRDS(sample_plan, sample_path)

5.3 Maps
**Map: Survey Sample Segments.** In the map below, each color shows a survey segment formed from grouped block-group areas in the sample plan. Use this map to confirm that all sampled block groups are assigned and that segment boundaries look spatially coherent.
Figure: Weighting Zone Groups. Each color shows a weighting zone group used in the survey weighting process. Use this map to confirm that all zones are correctly defined and visually coherent.
Lastly, PUMAs are assigned to their corresponding Weighting Zone Groups as defined in the project settings.
zone_groups <- get("zone_groups", settings)

# Expand the Zone Group Definitions into a Lookup Table of Client Zone IDs
grp_spec = lapply(zone_groups, function(x) data.table(client_zone_id = as.character(x)))
zone_groups_dt = rbindlist(grp_spec, idcol = "zone_group_label")

# Attach Zone Group Labels to the PUMA-Zone Crosswalk Geometry
zone_groups_dt = merge(
  puma_czone_sf,
  zone_groups_dt,
  by = 'client_zone_id',
  all.y = TRUE
)

5.4 Map: Weighting Zones and Weighting Zone Groups
Review the map below to determine if zone groups were defined as specified in the settings.
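As with the earlier map, the rendering code is not shown here; a minimal static sketch using ggplot2 and the zone_groups_dt object built above might be:

# Illustrative sketch of the weighting zone group map
library(ggplot2)

ggplot(zone_groups_dt) +
  geom_sf(aes(fill = zone_group_label)) +
  labs(fill = "Weighting zone group") +
  theme_minimal()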