# Load the Local Package for Development
devtools::load_all()
# Load Settings; Pass python_env Explicitly for Quarto
settings = get_settings(reload_settings = TRUE,
print = FALSE)
# Extract Commonly Used Paths and Labels From Settings
code_root <- get("code_root", settings)
work_dir <- get("working_dir", settings)
inputs_dir <- get("inputs_dir", settings)
outputs_dir<- get("outputs_dir", settings)
report_dir <- get("report_dir", settings)
popsim_initial_label <- get("popsim_initial_label", settings)
popsim_daypat_label <- get("popsim_daypat_label", settings)7 Survey Data Preparation
What You’ll Do in This Chapter
In this chapter, we prepare the HTS dataset for weighting by removing incomplete households, assigning households to relevant geographies, imputing key demographic variables, and formatting the data for PopulationSim.
8 Chapter Setup
8.1 Load Packages, Settings, and Directory paths
First, we set up our environment by loading necessary packages, settings, and directory paths.
8.2 Load Survey and PUMS Data
Next, we load the survey data tables, along with PUMS data that will be used later during missing data imputation.
sample_plan = fetch_hts_table("sample_plan", settings)
households = fetch_hts_table("household", settings)
persons = fetch_hts_table("person", settings)
days = fetch_hts_table("day", settings)
trips = fetch_hts_table("trip", settings)
value_labels = fetch_hts_table("value_labels", settings)
variable_list = fetch_hts_table("variable_list", settings)
pums_tabbed = readRDS(file.path(work_dir, 'pums_tabbed.rds'))
pums_cleaned = readRDS(file.path(work_dir, 'pums_cleaned.rds'))
pums_codebook = read_pums_codebook(settings)
#FIXME: In Raw Data, First Column Name Should be "variable."
setnames(variable_list, old = "V1", new = "variable", skip_absent = TRUE)9 Remove Incomplete Households
RSG’s weighting process is configured to only weight households with complete data. These households are usually removed from the HTS dataset upstream of weighting, so this section serves as a safeguard.
A household is considered complete and eligible for weighting if the entire household provided complete data on at least one eligible day of the week.
9.1 For Person-Level Studies - Limit to 1 Participant Per Household
The next step checks if the study is person-level. If so, it ensures that only one participant per household is included in the dataset. If multiple participants are found in a household, an error is raised, prompting the user to decide how to handle this case. Additionally, households with no participating persons are flagged with a warning. Finally, only the participating persons are retained in the dataset, and households without any participants are dropped.
PSRC is a household-level study, so this code block will not execute.
if (get("study_unit", settings) == "person") {
# If person level study then person #1 is the only person in the household
# to be weighted.
n_participants = persons[is_participant == 1, .N, by = hh_id]
# Check if there are any households with more than one person that took the survey
if (any(n_participants[, N > 1])) {
# You will need to decide how to handle this case.
# Likely split each person into their own household
msg = stringr::str_glue(
"Person level study but {sum(multiple_participants)} households have >1 participating persons"
)
stop(msg)
}
# Check if any households have no participating persons
if (any(n_participants[, N < 1])) {
msg = stringr::str_glue(
"Person level study but {sum(zero_participants)} households have no participating persons"
)
warning(msg)
}
# Keep only the person who took the survey
persons = persons[is_participant == 1]
# Drop households with no participating persons
households = households[hh_id %in% persons$hh_id]
}9.2 Drop Households with Incomplete/No Study Days
These households should not be in the dataset at this point but it can happen with “messy” completes (e.g., if completion status is manually set for some households). This code identifies weighted weekdays (day_ints) and keeps only households that have at least one complete study day on those weekdays.
day_ints = unlist(settings$weight_dow_groups, use.names = FALSE)
hh_with_study_days = days[
is_complete == 1 &
hh_day_complete == 1 &
travel_dow %in% day_ints,
unique(hh_id)
]
households = households[hh_id %in% hh_with_study_days]9.3 Set Correct Block Group Field
Next, if use_reported_home is true in the settings, households are assigned to the block group corresponding to their reported home location (rather than their sampled home location). The appropriate block group field (2010 or 2020) is selected based on the ACS year specified in the settings. The code then creates a new column bg_geoid in the households data table, which contains the block group geoid used for sample segment assignment. Finally, the block group geoid is padded to ensure it is 12 digits long.
Why use reported home location instead of sampled home location?
Across RSG’s studies, approximately 2–5% of households report a home address that differs from the sampled address to which the invitation was sent. In many cases, the reported home address is in a different weighting zone or block group from the sampled home address.
RSG tracks sampled home locations during fieldwork to monitor response rates and adjust sample plans. However, using the reported home location provides a more accurate representation of where the household actually resides. This is especially important for weighting and segment assignment, as it ensures households are categorized based on their true geographic context, which can influence demographic characteristics and travel behavior.
# Regex if 2020 or 2010 home_bg_201x or home_bg_202x
regex = ifelse(settings$acs_year < 2020, "^home_bg_201[0-9]", "^home_bg_202[0-9]")
home_bg_field = str_subset(names(households), regex)
# Get ABS Sample Segments Based on Block Group of Home Location
if (get("use_reported_home", settings)) {
households[, home_bg := get(home_bg_field)]
households[, bg_geoid := get(home_bg_field)]
} else {
# Use sample home location if it is available
households[, home_bg := get(home_bg_field)]
households[, bg_geoid := fifelse(is.na(sample_home_bg), get(home_bg_field), sample_home_bg)]
}
# Assign Block Group to Persons Based on Household bg_geoid
persons[
households,
`:=`(
bg_geoid = i.bg_geoid,
home_bg = i.home_bg
),
on = "hh_id"
]
persons[is.na(bg_geoid), bg_geoid := home_bg]
persons[, bg_geoid := str_pad(bg_geoid, width = 12, pad = 0)]
# Drop Households That don't Have a BG in the Sample Plan
households[, bg_geoid := str_pad(bg_geoid, width = 12, pad = 0)]9.4 Drop Households Not in Region From All Datasets
In this step, households that are not located within the defined study region (i.e., their block group geoid is not present in the sample plan) are removed from the dataset. Typically, upstream data processing removes these households before they reach this stage, but this code serves as a safeguard to ensure that only relevant households are retained for weighting.
cat("Dropping ", nrow(households[!bg_geoid %in% sample_plan$bg_geoid]), " households not in region")Dropping 0 households not in region
# Drop Households Not in Region, Cascade to Persons, Day, Trips
households = households[bg_geoid %in% sample_plan$bg_geoid]
persons = persons[hh_id %in% households$hh_id]
days = days[hh_id %in% households$hh_id]
trips = trips[hh_id %in% households$hh_id]9.5 Run Data Check
These checks ensure that all households are complete, located within the study region, and present in the sample plan. Additionally, they verify that the households and persons tables are symmetric, meaning that every household has corresponding persons and vice versa.
# Check for Any Incomplete Households
stopifnot(nrow(households[is_complete != 1]) == 0)
# Check for Households Outside the Region
stopifnot(nrow(households[home_in_region == 0]) == 0)
# Check for Households not in Sample Plan
stopifnot(nrow(households[!bg_geoid %in% sample_plan$bg_geoid]) == 0)
# Ensure the Tables are Symmetric
stopifnot(nrow(persons[!hh_id %in% households$hh_id]) == 0)
stopifnot(nrow(households[!hh_id %in% persons$hh_id]) == 0)10 Assign Sample Segment Using Reported Home Block Group
This section deals with sample segment assignment, preparing households for base weight estimation.
Sample segments are geographic strata used to structure survey sampling and track response rates. Assigning each household to the correct segment ensures that sampling effort and observed response rates are accurately reflected in the weighting process. The code below assigns sample segments to households based on their reported home block group geoid, using the provided sample plan. It also handles special cases for supplemental segments (e.g., non-probability segments) if specified in the settings.
Why does segment assignment matter for weighting? Base weights in survey sampling adjust for the probability that each unit (person, household, etc.) was selected. If sampling effort is stratified by segment (e.g., urban/rural, age group), each segment may have a different chance of being sampled and a different response rate.
Constructing base weights using observed response rates within each segment ensures that survey results are representative of the target population. Segments with lower response rates get higher weights, compensating for underrepresentation. This corrects for sampling bias and allows valid population estimates.
First, persons are assigned to block groups based on their household’s block group geoid.
# Used to Keep Track of the Number of Households or Persons in the Final Seed as it Gets Processed
expected_hh = nrow(households)
expected_per = nrow(persons)10.1 Join Segment Label to Households Table
We start with the original sample segment from the households table.
# Add _orig Suffix Because we are Going to Add New Cols From sample_plan
setnames(households, "sample_segment", "segment_num", skip_absent = TRUE)
value_labels[, value := as.integer(value)]
segments = value_labels[variable == "sample_segment", .(segment_num = value, label)]
households[segments, segment_name := i.label, on = "segment_num"]10.2 Set Invitation Column
All households are presumed to be invited via address-based sampling (ABS), unless non-probability segments (NPS)are specified in the settings. If NPS segments are defined, households in those segments are marked as “NPS” invitations, while all others remain “ABS.”
PSRC does not have NPS segments, so all households will be marked as “ABS.”
NPS households may require different treatment due to their non-probability sampling method, especially if they comprise a large (more than ~10%) share of the overall sample. For surveys with a large NPS segment, or if desired, RSG can “blend” ABS and NPS samples using the functions blend_initial_weights() and calc_alpha() during base weight estimation. These functions leverage modern composite weighting methods (See: Fahimi et al. 2024)[https://publichealth.jmir.org/2024/1/e48186/]. For surveys with small NPS segments, RSG typically treats them as part of the ABS sample for weighting purposes.
if ("nps_segments" %in% names(settings) && !is.null(settings$nps_segments)) {
nps_segments = value_labels[
variable == "sample_segment" &
label %in% settings$nps_segments,
]
nps_seg_num = unique(nps_segments$value)
stopifnot(length(nps_seg_num) > 0)
households[!segment_num %in% nps_seg_num, invitation := "ABS"]
households[segment_num %in% nps_seg_num, invitation := "NPS"]
} else {
households[, invitation := "ABS"]
}
stopifnot(households[is.na(invitation), .N] == 0)10.3 Assign Segment by Sample Plan
Households are assigned to sample segments based on the reported home block group geoid (rather than sampled block group geoid) using the sample plan. This ensures that each household is correctly categorized according to its geographic segment for weighting purposes.
# Joining Households Table to Sample Plan for ABS Segments on bg_geoid
# Enforce 12 Digit Geoids
sample_plan[, bg_geoid := str_pad(bg_geoid, width = 12, pad = 0)]
# Assert That Each BG is Represented Once
stopifnot(!any(sample_plan[, .N > 1, keyby = bg_geoid]$V1))
# Even Though Segment Should be in there Already, we do this Because:
# - Confirms based on home location, not sampled address location.
# - Handles the case where the segment is missing.
# - Enforces consistent sample segment names.
households[, segment_name_orig := segment_name]
# Assign Segment Name from Sample Plan not what was Reported
households[
sample_plan,
segment_name := i.segment_name,
on = .(bg_geoid)
]Next, if NPS segments are defined in the settings, households in those segments are assigned a supplemental segment name based on their segment number.
# Add Supplemental Segment for NPS Segments
if ("nps_segments" %in% names(settings) && !is.null(settings$nps_segments)) {
households[
nps_segments,
segment_name := i.label,
# Do this if you want a single segment name for all NPS segments
# segment_name := "supplemental",
on = .(segment_num = value)
]
}Checks are run to ensure that all households have been assigned a sample segment, that all household block group geoids are present in the sample plan, and that each block group belongs to only one segment.
# Make Sure All the Households Have a Sample Segment
stopifnot(households[is.na(segment_name), .N] == 0)
# Make Sure All the Household geoids are in the Sample Plan
stopifnot(
all(households[invitation == "ABS", bg_geoid] %in% sample_plan$bg_geoid)
)
# Make Sure Each BG Belongs to only One Segment
stopifnot(
all(
households[invitation == "ABS", .N, keyby = .(bg_geoid, segment_name)][,
.N, keyby = bg_geoid]$N == 1
))Segment information is then added to the persons table based on their household’s segment assignment.
# Add Segment Information to Persons Table for University Sample
persons[
households,
`:=`(
invitation = i.invitation,
segment_name = i.segment_name
),
on = "hh_id"
]10.4 Table: Households by Segment - Before and After Re-Assignment
This table summarizes the number of households in each segment before and after reassignment based on block group geoid. It provides a comparison of household counts to illustrate how segment assignments have changed. Large changes may indicate issues with segment definitions or sample plan accuracy.
Segment | Before Re-assignment | After Re-assignment |
|---|---|---|
King County-General | 460 | 456 |
King County-Hard-to-survey | 245 | 250 |
King County-Rural | 79 | 76 |
King County-Walk/Bike/Transit | 385 | 389 |
Kitsap County-General | 78 | 81 |
Kitsap County-Hard-to-survey | 14 | 14 |
Kitsap County-Rural | 44 | 43 |
Pierce County, Incorporated-General | 116 | 120 |
Pierce County, Incorporated-Hard-to-survey | 45 | 48 |
Pierce County, Rural (All) | 490 | 483 |
Pierce County, Unincorporated-General | 495 | 498 |
Pierce County, Unincorporated-Hard-to-survey | 121 | 116 |
Snohomish County-General | 231 | 231 |
Snohomish County-Hard-to-survey | 56 | 55 |
Snohomish County-Rural | 51 | 50 |
Total | 2,910 | 2,910 |
12 Assign Households to Weighting Zones (PUMAs)
RSG matches the geographic structure of the survey data to that of the weighting targets by assigning households and persons to Public Use Microdata Areas (PUMAs). This section assigns PUMA IDs to households and persons based on their block group geoid, which is essential for aligning the survey data with census-based weighting targets.
12.1 Append PUMA IDs to Households/Persons Data
To kick off weighting zone group assignment - which is based on PUMA geographies - households and persons are assigned PUMA IDs based on their block group geoid. If a household’s block group geoid is missing, the home block group is used as a fallback. The corresponding census tract ID is derived from the block group geoid, and a crosswalk is used to map tracts to PUMAs. Finally, the PUMA ID is added to both the households and persons tables.
households[is.na(bg_geoid), bg_geoid := home_bg]
households[, tract_id := str_sub(bg_geoid, 1, 11)]
tracts_to_pumas = get_tracts_puma_xwalk(
settings,
puma_year = settings$pums_year,
tract_year = settings$acs_year
)
households[
tracts_to_pumas,
puma_id := i.PUMA,
on = .(tract_id)
]
households[is.na(puma_id), puma_id := home_puma_2022]
persons[households, puma_id := i.puma_id, on = "hh_id"]
# h_puma ------------------------------
households[, h_puma := puma_id]
# p_puma ------------------------------
persons[, p_puma := puma_id]13 Prepare Key Variables: Income, Household Size
This section serves as a safeguard to ensure that key variables required for weighting, specifically household income and household size, are correctly prepared and consistent with the requirements of RSG’s weighting process.
This code will likely be deprecated in the future as these variables are typically prepared upstream of weighting. A patch to hts.iochecker schema is the preferred solution to ensure these variables are correctly formatted before reaching this stage.
13.1 Assign Broad Income Categories
Household income is a key variable for weighting and analysis. This section assigns broad income categories to households based on their reported income. Helper function get_income_broad() is concatenates detailed income responses (income_detailed) with non-response followup responses.
# If income_followup is Missing, use income_broad (legacy)
if (!"income_followup" %in% value_labels$variable) {
inc_broad_labs = copy(value_labels[variable == 'income_broad', ])
inc_broad_labs[, variable := 'income_followup']
value_labels = rbind(value_labels, inc_broad_labs)
}
if(!"income_followup" %in% names(households)){
households[, income_followup := income_broad]
}
# Recode 995 to 999 so that Income Imputation Catches it
households[income_followup == 995, income_followup := 999L]
# Label Bins and Top Code if Necessary
households = get_income_broad(households, value_labels, settings)
# Assert no Missing Labels
stopifnot(all(!is.na(households$income_broad)))13.2 Calculate Household Size
Household size is calculated upstream of the weighting process, but here we check that it is consistent with the number of persons in each household. If the study unit is household, we count the number of persons per household and compare it to the num_people field in the households table. If there are discrepancies (which may occur due to top-coding of household size), they are reported for review.
if (get("study_unit", settings) == "household") {
hh_size = persons[, .N, keyby = hh_id]
# Check that hh_size is consistent with the number of persons in the household
households[, num_people_orig := num_people]
households[hh_size, num_people := i.N, on = "hh_id"]
inc_hhs <- nrow(households[num_people != num_people_orig])
cat("Households with inconsistent household size: ", inc_hhs)
}Households with inconsistent household size: 0
13.3 Run Checks
Here, we check that data preparation steps have preserved the expected number of households and persons. We calculate the total number of unique households and persons in the dataset and compare them to the expected counts stored earlier. If there are discrepancies, an error is raised to alert the user.
# Check Unique Household and Person Counts
households[, h_total := .N, by = hh_id]
persons[, p_total := .N, by = person_id]
stopifnot(households[h_total != 1, .N] == 0)
stopifnot(persons[p_total != 1, .N] == 0)
# Check that Totals Have Not Changed
stopifnot(nrow(households) == expected_hh)
stopifnot(nrow(persons) == expected_per)13.4 Sort and Save Cleaned Household and Persons Data
households = households[order(hh_id)]
persons = persons[order(person_id)]
saveRDS(households, file.path(work_dir, 'households_survey_cleaned.rds'))
saveRDS(persons, file.path(work_dir, 'persons_survey_cleaned.rds'))14 Survey Data Imputation
The income, gender, and ethnicity questions in the survey allowed participants to respond with “prefer not to answer.” Additionally, non-related household members (i.e., roommates) were not required to supply their gender, race, or ethnicity; and children were not required to provide their race or ethnicity. To facilitate data weighting, imputed missing values for these variables are created when a participant was not required to answer or selected “prefer not to answer.”
14.1 Income Imputation
This process imputed income using an ordinal regression model. Specifically, using a proportional odds logistic regression model (POLR), which is designed for predicting ordinal outcomes (appropriate for income brackets ). The model estimates the probability that an observation falls at or below a particular income bracket, based on the values of the predictor variables. Missing income was predicted based on a set of independent variables including:
- Income distribution of the block group based on ACS 2022 5-year data, with the median income ($75-100K) used as a reference level.
- Number of non-working adults in the household.
- Number of children in the household.
- Employment status of household members.
- Educational attainment of household members.
- Age of the primary survey respondent.
- Whether the home is owned by the household.
- Whether the household is a single-family home.
The function impute_income_nonrelatives imputes household income for households with nonrelatives using PUMS and survey variables. This is implemented if impute_unrelated_income is TRUE in the settings. A user might wish to use this functionality if unrelated householders were specifically excluded from the household income question.
This function is not typically applied but is retained here for backwards compatibility.
if (any(stringr::str_detect(names(settings$targets), "income"))) {
# Income PNTA ------------------------------------------------------------
hh_income_pnta = impute_income_pnta(
households = households,
persons = persons,
value_labels = value_labels,
settings = settings
)
## Nonrelatives' Income ----------------------------------------------------
hh_income_imputed_nr = impute_income_nonrelatives(
persons = persons,
pums_cleaned = pums_cleaned,
hh_income_pnta = hh_income_pnta,
codebook = pums_codebook,
value_labels = value_labels,
settings = settings
)
## Join imputed income back to households ---------------------------------
households[hh_income_imputed_nr, `:=`(
income_imputed = i.income_imputed,
income_imputed_label = i.income_imputed_label
), on = "hh_id"]
label_df = unique(households[, .(income_imputed_label)])
label_df[
,
income_imputed_value := get_income_bounds(income_imputed_label, direction = 'upper'),
]
households[
label_df,
income_imputed_value := i.income_imputed_value,
on = .(income_imputed_label)
]
}14.1.1 Table: Income imputation model summary
Here, we show the summary of the income imputation model. Significant coefficients (low p-values) indicate reliable predictors (often things like education or homeownership) while others may vary by dataset or model run. Coefficient signs should align with expectations (e.g., higher education -> higher income). McFadden’s R² values around 0.1–0.3 generally indicate a good fit for behavioral or survey models, while values below 0.1 suggest weak fit and above 0.3 are unusually strong. Variable significance can shift between runs – interpret the overall direction and pattern of effects rather than any single estimate.
Parameter | Description | Estimate | Std Error | T-Statistic | p.value |
|---|---|---|---|---|---|
f_0_24999 | -0.022 | 0.600 | -0.04 | 0.97 | |
f_25000_49999 | 0.884 | 0.650 | 1.36 | 0.173 | |
f_50000_74999 | 0.806 | 0.656 | 1.23 | 0.219 | |
f_100000_199999 | 1.700 | 0.543 | 3.13 | 0.002 | |
f_200000_plus | 3.987 | 0.496 | 8.03 | <0.001 | |
nonworking_adult_n | 0.301 | 0.058 | 5.19 | <0.001 | |
child_n | 0.013 | 0.054 | 0.24 | 0.807 | |
ft_grad_n | 2.243 | 0.104 | 21.48 | <0.001 | |
pt_grad_n | 0.897 | 0.212 | 4.23 | <0.001 | |
ft_bachelors_n | 1.808 | 0.096 | 18.76 | <0.001 | |
pt_bachelors_n | 0.417 | 0.179 | 2.33 | 0.02 | |
ft_no_college_n | 1.088 | 0.088 | 12.31 | <0.001 | |
pt_no_college_n | 0.054 | 0.142 | 0.38 | 0.703 | |
head_under_35_n | -0.014 | 0.104 | -0.13 | 0.894 | |
head_65_plus_n | -0.124 | 0.104 | -1.19 | 0.234 | |
own_home | 1.151 | 0.099 | 11.66 | <0.001 | |
single_family_home | 0.351 | 0.095 | 3.68 | <0.001 | |
McFadden's rho-squared: 0.17 | |||||
14.2 Gender Imputation
The process probabilistically assigned missing gender responses using a Monte Carlo procedure based on the sample data’s gender distribution within the respondent’s age category.
if (any(stringr::str_detect(names(settings$targets), "gender"))) {
gender_dt = impute_gender(
persons = persons,
value_labels = value_labels,
report_dir = report_dir,
outputs_dir = outputs_dir,
seed = get('rng_seed', settings),
settings = settings
)
if ("gender" %in% names(persons)) {
persons[, gender_orig := gender]
}
# Join imputed gender back to persons
persons[gender_dt, gender_imputed := i.gender_imputed, on = .(person_id)]
}14.2.1 Table: Gender Distribution: Original (Relabeled) vs Imputed
Gender distribution is summarized here before and after imputation, broken out by adults and children. This allows for comparison of counts to see how imputation affected the gender breakdown. Large changes may indicate issues with the imputation model or assumptions.
age | gender | num_before_imputation |
|---|---|---|
Adult | Man/Boy (cisgender or transgender) | 2,291 |
Adult | Non-binary/Something else fits better | 65 |
Adult | Prefer not to answer | 418 |
Adult | Woman/Girl (cisgender or transgender) | 2,392 |
Child | Man/Boy (cisgender or transgender) | 397 |
Child | Non-binary/Something else fits better | 6 |
Child | Prefer not to answer | 87 |
Child | Woman/Girl (cisgender or transgender) | 381 |
age | gender | num_after_imputation |
|---|---|---|
Adult | female | 2,639 |
Adult | male | 2,527 |
Child | female | 423 |
Child | male | 448 |
14.3 Race and Ethnicity Imputation
The imputation process probabilistically assigned missing race and ethnicity responses using a Monte Carlo procedure based on the ACS data’s race and ethnicity distribution within the respondent’s reported home block group.
# If Race is not in Targets, Skip Imputation
if (any(stringr::str_detect(names(settings$targets), "race"))) {
race_dt = impute_race(
persons = persons,
variable_list = variable_list,
settings = settings
)
if ("race" %in% names(persons)) {
persons[, `:=`(race_orig = race, race = NULL)]
}
# Join imputed race back to persons
persons[race_dt, `:=`(race = i.race, race_imputed = i.race_imputed), on = "person_id"]
}
# Impute Ethnicity if it is in Targets
if (any(stringr::str_detect(names(settings$targets), "ethnicity"))) {
eth_dt = impute_ethnicity(
persons = persons,
variable_list = variable_list,
settings = settings
)
if ("ethnicity" %in% names(persons)) {
persons[, `:=`(ethnicity_orig = ethnicity, ethnicity = NULL)]
}
persons[eth_dt, ethnicity_imputed := i.ethnicity_imputed, on = "person_id"]
}14.3.1 Table: Race Distribution: Original (Relabeled) vs Imputed (Counts and Percentages)
adult | Race | Original (relabeled) | Imputed | Pct Original | Pct Imputed |
|---|---|---|---|---|---|
Adult | afam | 107 | 134 | 2.1% | 2.6% |
Adult | asian_pacific | 488 | 555 | 9.4% | 10.7% |
Adult | missing | 599 | 0 | 11.6% | 0.0% |
Adult | other | 342 | 454 | 6.6% | 8.8% |
Adult | white | 3,630 | 4,023 | 70.3% | 77.9% |
Child | afam | 0 | 50 | 0.0% | 5.7% |
Child | asian_pacific | 0 | 99 | 0.0% | 11.4% |
Child | missing | 871 | 0 | 100.0% | 0.0% |
Child | other | 0 | 175 | 0.0% | 20.1% |
Child | white | 0 | 547 | 0.0% | 62.8% |
14.3.2 Table: Imputation Rates of Race and Ethnicity Across Weighting Zone Groups
Weighting Zone Group | Pct Imputed Race | Pct Imputed Ethnicity |
|---|---|---|
King County - Seattle | 18.1% | 18.6% |
King County - Other | 27.1% | 28.5% |
Kitsap County - Expanded | 23.7% | 26.1% |
Pierce County | 26.8% | 28.4% |
Snohomish County | 21.5% | 22.8% |
14.4 Save Imputed Data
saveRDS(households, file.path(work_dir, 'households_survey_imputed.rds'))
saveRDS(persons, file.path(work_dir, 'persons_survey_imputed.rds'))15 Format Survey Data for PopulationSim
15.1 Configure Target Variables
The final step in survey data preparation is generating survey data tables in PopulationSim input format. Using the processed survey household and person files, this step prepares a PopulationSim “seed” or incidence table and tabulated summaries at the level of weighting zone groups.
# Load Data =====================================================================
sample_plan = fetch_hts_table("sample_plan", settings)
value_labels = fetch_hts_table("value_labels", settings)
days = fetch_hts_table("day", settings)
trips = fetch_hts_table("trip", settings)
households_raw = fetch_hts_table("household", settings)
households = readRDS(file.path(work_dir, 'households_survey_imputed.rds'))
persons = readRDS(file.path(work_dir, 'persons_survey_imputed.rds'))
target_puma = readRDS(get("target_puma_path", settings))
target_czones = readRDS(get("target_czone_path", settings))
target_region = readRDS(get("target_region_path", settings))
# Prepare & Tabulate Targets ===================================================
# This tabulates the prepared data into seed targets for the weighting process
# In other words, it converts the targets to one-hot encoding (wide format)
seed = prepare_targets(
households,
persons,
value_labels,
settings
)
seed[households, puma_id := i.puma_id, on = "hh_id"]
seed[, region := 1]
seed[, h_total := 1]15.2 Append Sample Segments and Weighting Zones to Seed
This step appends segment names and client zones to a “seed” dataset by joining with household and sample plan data. It first assigns segment-related fields from households, ensuring every record has a segment name. Then, depending on the zone type in the settings, it adds weighting zone information either directly from the sample plan or by spatially joining household coordinates to weighting zones. If client zones are not used, it assigns PUMA IDs as the weighting zone IDs instead of specified client zones.
# Add the Segment ----------------------------------------------
seed[households, `:=`(
bg_geoid = i.bg_geoid,
segment_name = i.segment_name,
invitation = i.invitation
),
on = "hh_id"
]
stopifnot(seed[is.na(segment_name), .N] == 0)
# Add Client Zones ------------------------------------------------
# If Using Client Zones, Add Them to the Seed. Else Use PUMAS as Client Zones
if (
get("zone_type", settings) == "client_zones"
) {
# If client zone is defined in the sample plan, use that, otherwise use lat/lon
if ("client_zone" %in% names(sample_plan)) {
seed[
sample_plan, `:=`(
client_zone = i.client_zone,
client_zone_id = i.client_zone_id
),
on = "bg_geoid"
]
} else {
# Assign client zone by lat/lon point
client_zones = fetch_hts_table("client_zones", settings)
points_sf = st_as_sf(
households[, .(hh_id, home_lat, home_lon)],
coords = c("home_lon", "home_lat"),
crs = 4326
)
client_zones_sf = client_zones %>% sf::st_transform(4326)
seed_czones = sf::st_join(points_sf, client_zones_sf, join = st_nearest_feature)
# Join client zones to seed on block group using the crosswalk
seed[
as.data.table(seed_czones),
`:=`(
client_zone = i.client_zone,
client_zone_id = i.client_zone_id
),
on = "hh_id"
]
}
} else {
seed[, client_zone_id := puma_id]
}15.3 Format Variables for PopulationSim
Next, variables are prepared and flattened variables in the “seed” dataset for use in PopulationSim. It identifies household-level, person-level, and additional info columns. If the study unit is “household,” it merges household data with aggregated person-level variables for each household; otherwise, it uses the seed data as-is. Finally, it sets the column order for the flattened dataset, ensuring household and person variables are organized appropriately.
# Flatten Household/Person Variables ===========================================
hh_vars = str_subset(names(seed), "^h_")
per_vars = str_subset(names(seed), "^p_")
info_cols = str_subset(names(seed), "^(?!h_|p_|hh_id|person_id).*$")
if (get("study_unit", settings) == "household") {
# Grab first person in household for household level data
seed_flat = merge(
seed[, .SD[1], by = hh_id, .SDcols = c(info_cols, hh_vars)],
seed[, lapply(.SD, sum), by = hh_id, .SDcols = per_vars],
by = "hh_id"
)
} else {
seed_flat = seed
}
setcolorder(
seed_flat,
c(
'hh_id', 'puma_id', "client_zone_id", #'initial_weight',
setdiff(hh_vars, 'h_total'),
'h_total',
setdiff(per_vars, 'p_total'),
'p_total'
)
)15.4 Prepare Seed Data for Day Groups
This code prepares seed data for day groups by reshaping household-day records so each household and day group combination becomes a separate row. It assigns day group labels based on travel days, removes records without valid day groups, and pivots the data to wide format with indicators for each day group. The code then merges this with the flattened seed table, creates compound household-day IDs, and calculates person counts per day group. It includes checks for expected day group counts and completeness, stopping execution if missing or invalid day groups are detected.
# If Multiple Day Groups, Reshape Seed so Each Hh Day is a Separate Row
# Convert day_groups into a `data.table`
day_groups_dt = get_day_groups(settings)
day_groups = unique(day_groups_dt$day_group)
# Create a Unique List of Days for Each Household in the Seed Table
hh_day = unique(days[hh_id %in% seed$hh_id & hh_day_complete == 1, .(hh_id, travel_dow)])
hh_day[day_groups_dt, day_group := i.day_group, on = "travel_dow"]
# Drop NA Days Not in the day_groups_dt
hh_day = hh_day[!is.na(day_group)]
# Reshape the Data into a Wide Format
hh_day[, h_dow := paste0("h_dow_", day_group)]
hh_day_wide = dcast(
hh_day,
hh_id + day_group ~ h_dow,
value.var = "h_dow",
fun.aggregate = function(x) as.integer(length(x) > 0)
)
# Join the Seed Table with the Household-Day Table ---------------------------
# Intentionally Using an Inner Join to Drop HH's with Only Non-Study Days
seed_day = merge(seed_flat, hh_day_wide, by = "hh_id")
# Create New Compound hh_id
seed_day[, hh_id_dow := paste0(hh_id, "_", day_group)]
# Create p_dow as h_dow * p_total
seed_day[,
(paste0("p_dow_", day_groups)) := lapply(.SD, function(x) x * p_total),
.SDcols = paste0("h_dow_", day_groups)
]
# Check that There are as Many Day Groups as Expected and no NAs
if (
any(is.na(seed_day$day_group)) |
(seed_day[, uniqueN(day_group)] != length(day_groups))
) {
stop("Warning: NAs in day_group. possible bad join.")
}15.5 Optional: Create Transit Targets
This code optionally calculates household-level transit trip targets if transit trips are specified in the settings. It identifies transit modes, aggregates transit trips by household and day group, adjusts for surveyable persons, reshapes the data to wide format per day group, and merges the results into the seed dataset. It ensures households with no transit trips are properly filled and applies a scaling factor for boardings when required. If transit trips are not specified, it simply passes through the existing seed data.
PSRC did not use a transit target.
if ("h_transit_trips" %in% names(settings$targets)) {
# Transit trips are effectively the average transit trips per days for each household.
transit_regex = paste(settings$transit_modes, collapse = "|")
transit_modes = value_labels[
variable == "mode_type" &
tolower(label) %like% transit_regex,
]
if (nrow(transit_modes) == 0) {
stop("No transit modes found in value_labels that match ", paste(settings$transit_modes, collapse = ", "))
}
# Extract the DOWs we're weighting for
weight_dow = unlist(get("weight_dow_groups", settings))
day_groups = get_day_groups(settings)
trips[day_groups, day_group := i.day_group, on = .(travel_dow)]
transit_by_day = trips[
mode_type %in% transit_modes$value &
hh_day_complete == 1 &
travel_dow %in% weight_dow, .N,
by = day_group
]
message("Transit modes: ", paste(transit_modes$label, collapse = ", "))
message("Transit trips by day group:")
print(transit_by_day)
# Aggregate transit trips to households
agg_fun = ifelse(settings$transit_target_type == "boardings", '.N', 'uniqueN(linked_trip_id)')
h_transit_trips = trips[
travel_dow %in% weight_dow &
mode_type %in% transit_modes$value &
hh_day_complete == 1,
.(total_transit_trips = eval(parse(text = agg_fun))),
by = .(hh_id, day_group)
]
# Number of complete study days
num_complete_dow = calc_complete_hhdays(households, days, settings)
num_complete_dow = num_complete_dow[day_group != "nonstudy_day",]
# Calculate average trips per day group
h_transit_trips[
num_complete_dow,
n_complete_dow := i.n_complete_hh_days,
on = .(hh_id, day_group)
]
h_transit_trips[is.na(n_complete_dow), n_complete_dow := 0]
h_transit_trips[, h_transit_trips := total_transit_trips / n_complete_dow]
# Get fraction of unsurveyable persons per household
# We do this to counteract the unrelated person adjustment in the trip weights
if (settings$unrelated_adjustment %in% c("person", "day")) {
h_transit_trips[households_raw, f_surveyable := i.num_people / i.num_surveyable, on = "hh_id"]
h_transit_trips[, h_transit_trips := h_transit_trips * f_surveyable]
}
# Reshape to wide on day_group
h_transit_trips[, label := paste0("h_transit_trips_", day_group)]
h_transit_trips_daygroup = dcast(
h_transit_trips,
hh_id + day_group ~ label,
value.var = "h_transit_trips",
fill = 0
)
# Assign to household seed
seed_final = merge(
seed_day,
h_transit_trips_daygroup,
by = c("hh_id", "day_group"),
all.x = TRUE
)
# fill in households with 0 transit trips
transit_cols = grep("^h_transit_trips_", names(seed_final), value = TRUE)
seed_final[, (transit_cols) := lapply(.SD, function(x) {
ifelse(!is.finite(x), 0, x)
}),
.SDcols = transit_cols
]
# Stop if all 0 transit trips, bad join
stopifnot(
seed_final[,
lapply(.SD, function(x) all(x == 0)), .SDcols = transit_cols
][, Reduce(`&`, .SD)] == FALSE
)
# Scale boardings per trip rate for rMove -----------------------------------
# ONLY TARGET BOARDINGS IF ALL TRIPS ARE SUFFICIENTLY UNLINKED
# Otherwise you should "link" the "unlinked" trips first and make linked trips the target.
if (get("transit_target_type", settings) == "boardings") {
bpt = get("transit_boardings_per_trip", settings)
seed_final[
diary_platform != 'rmove',
(transit_cols) := lapply(.SD, function(x) {x * bpt}),
.SDcols = transit_cols
]
}
} else {
seed_final = seed_day
}15.6 Optional: Aggregate Target Categories for Weighting
Sometimes certain targets and target categories can have small sample sizes in the survey data, which can impact the results of weighting. One option is to aggregate target categories for certain weighting zone groups or the entire region. This step will optionally aggregate target categories as specified in the project settings file.
# Append the Zone Group to the Matching Weighting Zone ID in the Seed Table
seed_adjusted <- seed_final[zone_groups_dt, zone_group := i.zone_group_label, on = "puma_id"]
zone_groups_dt[, zone_group := zone_group_label]
# Update the Targets as Specified in the Project Settings Configurations
seed_adjusted <- update_targets(seed_adjusted, settings, geo_cross_walk = as.data.table(zone_groups_dt), geom = "zone_group")15.7 Perform Seed Checks and Save Data
This code validates the final seed dataset by checking group totals and ensuring column names match the survey targets. It reorders columns for consistency, sorts the data by household ID, and saves the final seed data to disk as an RDS file.
# Check that Totals Add Up Properly --------------------------------------------
check_group_sums(seed_final, settings)
# Check that Column Names Match Survey Data -------------------------------
seed_names = str_subset(names(seed_final), "^(h|p)_")
target_names = str_subset(unique(c(names(target_czones), names(target_region))), "^(h|p)_")
not_in_targets = setdiff(seed_names, target_names)
not_in_seed = setdiff(target_names, seed_names)
stopifnot(length(c(not_in_targets, not_in_seed)) == 0)
seed_names = seed_names[seed_names %in% names(target_czones)]
setcolorder(target_puma, seed_names)
setcolorder(target_czones, seed_names)
setcolorder(target_region, seed_names)
# FINAL FULL OUTPUT FILES ------------------------------------------------------
seed_final = seed_final[order(hh_id)]
saveRDS(seed_final, get("seed_path", settings))15.8 Review Household- and Person-Level Targets by Weighting Zone Group: Unweighted Survey Counts
15.8.1 Table: Household-Level Target Counts for Unweighted Survey Data
Target Variable | Target Category | King County - Seattle | King County - Other | Kitsap County - Expanded | Pierce County | Snohomish County | Total |
|---|---|---|---|---|---|---|---|
Household size | 1 | 340 | 155 | 98 | 302 | 108 | 1,003 |
2 | 205 | 209 | 200 | 441 | 144 | 1,199 | |
3+ | 74 | 154 | 89 | 307 | 84 | 708 | |
Household income | $0-$24,999 | 58 | 33 | 20 | 85 | 21 | 217 |
$25,000-$49,999 | 77 | 51 | 51 | 180 | 46 | 405 | |
$50,000-$74,999 | 64 | 62 | 55 | 165 | 38 | 384 | |
$75,000-$99,999 | 78 | 54 | 60 | 148 | 53 | 393 | |
$100,000-$199,999 | 211 | 178 | 153 | 366 | 135 | 1,043 | |
$200,000+ | 131 | 140 | 48 | 106 | 43 | 468 | |
Number of workers | 0 | 155 | 173 | 197 | 438 | 127 | 1,090 |
1 | 298 | 183 | 122 | 372 | 121 | 1,096 | |
2+ | 166 | 162 | 68 | 240 | 88 | 724 | |
Vehicle sufficiency | None | 158 | 32 | 6 | 24 | 11 | 231 |
Insufficient | 150 | 103 | 52 | 161 | 52 | 518 | |
Sufficient | 311 | 383 | 329 | 865 | 273 | 2,161 | |
Number of children | 0 | 556 | 408 | 331 | 831 | 279 | 2,405 |
1+ | 63 | 110 | 56 | 219 | 57 | 505 | |
Total: Households | 619 | 518 | 387 | 1,050 | 336 | 2,910 | |
Total-Household DOW | Avg weekday | 619 | 518 | 387 | 1,050 | 336 | 2,910 |
15.8.2 Table: Person-Level Target Counts for Unweighted Survey Data
Target Variable | Target Category | King County - Seattle | King County - Other | Kitsap County - Expanded | Pierce County | Snohomish County | Total |
|---|---|---|---|---|---|---|---|
Gender | Male | 515 | 568 | 392 | 1,153 | 347 | 2,975 |
Female | 497 | 567 | 432 | 1,216 | 350 | 3,062 | |
Age | 0-4 | 32 | 43 | 25 | 85 | 28 | 213 |
5-15 | 56 | 114 | 61 | 269 | 57 | 557 | |
16-17 | 6 | 22 | 12 | 52 | 9 | 101 | |
18-24 | 53 | 66 | 28 | 106 | 27 | 280 | |
25-44 | 489 | 346 | 172 | 582 | 211 | 1,800 | |
45-64 | 219 | 288 | 195 | 615 | 181 | 1,498 | |
65+ | 157 | 256 | 331 | 660 | 184 | 1,588 | |
Employment | Non worker | 319 | 549 | 507 | 1,379 | 347 | 3,101 |
Part-time | 76 | 80 | 62 | 144 | 54 | 416 | |
Full-time | 617 | 506 | 255 | 846 | 296 | 2,520 | |
Commute mode | Bike/transit/walk | 68 | 18 | 45 | 38 | 169 | |
Work from home | 255 | 194 | 89 | 254 | 117 | 909 | |
Transit | 128 | 128 | |||||
Walk | 61 | 61 | |||||
Bike | 35 | 35 | |||||
Other (includes auto) | 206 | 310 | 203 | 660 | 191 | 1,570 | |
None | 327 | 563 | 514 | 1,410 | 351 | 3,165 | |
University student status | No | 944 | 1,077 | 783 | 2,275 | 667 | 5,746 |
Yes | 68 | 58 | 41 | 94 | 30 | 291 | |
Educational attainment | No college | 167 | 323 | 243 | 1,033 | 211 | 1,977 |
Some college | 845 | 812 | 581 | 1,336 | 486 | 4,060 | |
Race | Asian or Pacific Islander | 151 | 248 | 37 | 126 | 92 | 654 |
Black or African American | 43 | 25 | 13 | 93 | 10 | 184 | |
Other | 98 | 119 | 62 | 285 | 65 | 629 | |
White | 720 | 743 | 712 | 1,865 | 530 | 4,570 | |
Ethnicity | Not Hispanic | 940 | 1,069 | 790 | 2,152 | 651 | 5,602 |
Hispanic | 72 | 66 | 34 | 217 | 46 | 435 | |
Total: Persons | 1,012 | 1,135 | 824 | 2,369 | 697 | 6,037 | |
Total-Person DOW | Avg weekday | 1,012 | 1,135 | 824 | 2,369 | 697 | 6,037 |