Skip to contents

Prepare datasets to make summaries

Usage

hts_prep_variable(
  summarize_var = NULL,
  summarize_by = NULL,
  variables_dt = variable_list,
  data = list(hh = hh, person = person, day = day, trip = trip, vehicle = vehicle),
  id_cols = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"),
  weighted = TRUE,
  wt_cols = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"),
  remove_outliers = TRUE,
  threshold = 0.975,
  remove_missing = TRUE,
  missing_values = c("Missing Response", "995"),
  not_imputable = -1,
  strataname = NULL
)

Arguments

summarize_var

Name of the variable to summarize. Default is NULL.

summarize_by

Name(s) of the variable(s) to summarize summarize_var by. Default is NULL.

variables_dt

List of variable locations and descriptions in data.table format.

data

List of household, person, vehicle, day, and trip tables in data.table format.

id_cols

Name of the unique identifier column for each table in data.

weighted

Whether the data is weighted. Default is TRUE.

wt_cols

Name of the weight column for each table in data.

remove_outliers

Whether to remove outliers for numeric variables. Default is TRUE.

threshold

Threshold to define outliers. Default is 0.975.

remove_missing

Whether to remove missing values from the summary. Default is TRUE.

missing_values

Missing values to remove. Default is c("Missing Response", "995").

not_imputable

Value representing 'Not imputable' to remove. Default is -1.

strataname

Name of the strata variable to bring in. Default is NULL.

Value

A list containing the categorical and numeric datasets of the summary variables and key columns. Depending on whether the summarize variable is categorical or numeric, the list also contains either an indication of whether the summarize variable is shared or a breakdown of the outliers.

Examples


require(data.table)
require(stringr)
hts_prep_variable(
  summarize_var = "age",
  variables_dt = variable_list,
  data = list(
    "hh" = hh,
    "person" = person,
    "day" = day,
    "trip" = trip,
    "vehicle" = vehicle
  )
)
#> $cat
#>       hh_id person_id age person_weight
#>    1:   356         1  10           229
#>    2:   724         2  11           128
#>    3:   681         3   7           888
#>    4:   114         4   2           350
#>    5:   165         5  10           825
#>   ---                                  
#> 2043:   931      2043   7           116
#> 2044:   667      2044   1           122
#> 2045:   543      2045   3           494
#> 2046:   749      2046  11           874
#> 2047:   364      2047  11           393
#> 
#> $num
#> NULL
#> 
hts_prep_variable(
  summarize_var = "speed_mph",
  summarize_by = "age",
  variables_dt = variable_list,
  data = list(
    "hh" = hh,
    "person" = person,
    "day" = day,
    "trip" = trip,
    "vehicle" = vehicle
  )
)
#> Warning: 378 outliers were removed based on the threshold of 0.975.
#> $cat
#>        hh_id person_id day_id trip_id person_weight trip_weight speed_mph age
#>     1:     2       425    388   11340           130         258      1-10  12
#>     2:     2       425    388    9915           130          57     19-28  12
#>     3:     2       425   1320    4947           130         233     10-19  12
#>     4:     2       892   1559    4450           715         845 1 or less   3
#>     5:     2       892   1559    7943           715         685     10-19   3
#>    ---                                                                       
#> 14718:  1000       352   1663    2279           883         177     19-28  10
#> 14719:  1000       352   3389    7926           883         819     10-19  10
#> 14720:  1000       352   3389    1639           883          25     10-19  10
#> 14721:  1000       352   3389   13926           883         216      1-10  10
#> 14722:  1000       352   3389   11951           883         700      1-10  10
#> 
#> $num
#>        hh_id person_id day_id trip_id person_weight trip_weight speed_mph age
#>     1:     2       425    388   11340           130         258  2.981870  12
#>     2:     2       425    388    9915           130          57 20.761805  12
#>     3:     2       425   1320    4947           130         233 16.885215  12
#>     4:     2       892   1559    4450           715         845  0.610109   3
#>     5:     2       892   1559    7943           715         685 10.902785   3
#>    ---                                                                       
#> 14718:  1000       352   1663    2279           883         177 26.002610  10
#> 14719:  1000       352   3389    7926           883         819 10.201849  10
#> 14720:  1000       352   3389    1639           883          25 13.915420  10
#> 14721:  1000       352   3389   13926           883         216  3.337872  10
#> 14722:  1000       352   3389   11951           883         700  2.583252  10
#> 
#> $outliers
#>    threshold num_removed min_outlier max_outlier
#> 1:     0.975         378    112.9918    228233.1
#> 


hts_prep_variable(
  summarize_var = "employment",
  summarize_by = c("age", "race"),
  variables_dt = variable_list,
  data = list(
    "hh" = hh,
    "person" = person,
    "day" = day,
    "trip" = trip,
    "vehicle" = vehicle
  )
)
#> $cat
#>       hh_id person_id person_weight employment age                 race
#>    1:     2       217            92          3   3          Two or more
#>    2:     2       425           130          5  12          Two or more
#>    3:     2       892           715          1   3 Prefer not to answer
#>    4:     3       417           987          8  10 Prefer not to answer
#>    5:     3      1671           644          5   9          Two or more
#>   ---                                                                  
#> 1749:   997      1724           452          2  11 Prefer not to answer
#> 1750:   998       409           406          5   7          Two or more
#> 1751:   998       817           664          6   4 Prefer not to answer
#> 1752:   999      1305           836          6  10          Two or more
#> 1753:  1000       352           883          1  10          Two or more
#> 
#> $num
#> NULL
#>