-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutil.R
More file actions
403 lines (350 loc) · 11.8 KB
/
util.R
File metadata and controls
403 lines (350 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
# Utilities for working with Census data
#
# Contact: Edgar Castro <edgar_castro@g.harvard.edu>
library(data.table)
library(pbapply)
library(tidycensus)
library(totalcensus)
library(tools)
# Setup -------------------------------------------------------------------
# Path to save tidycensus results to
TIDYCENSUS_CACHE <- "cache/tidycensus"
# Path to save totalcensus results to
TOTALCENSUS_CACHE <- "cache/totalcensus"
# Path where the Census API key will be loaded from
API_KEY_FILE <- "api_key.txt"
# Geographies that do not require a state to be specified
GEOGRAPHIES_NO_STATE <- c(
  "us", "region", "state", "county", "division", "place",
  "combined statistical area", "congressional district", "urban area",
  "public use microdata area", "zip code tabulation area", "zcta"
)
# Geographies most commonly requested by callers of this module
MAIN_GEOGRAPHIES <- c(
  "state", "county", "tract", "block group", "block", "zip code tabulation area"
)
# State-level FIPS lookup table; assumed to have a numeric FIPS column
FIPS_DF <- read.csv("external/fips.csv")
# FIPS codes for the 50 states + DC - FIPS codes are sorted by first by states
# alphabetically, so we can subset the FIPS codes to those at or before Wyoming
# ("56")
STATES_DC_FIPS <- sprintf("%02d", subset(FIPS_DF, FIPS <= 56)[["FIPS"]])
# Register the Census API key with tidycensus, or scaffold the key file and
# stop so the user knows where to put it
if (file.exists(API_KEY_FILE)) {
  suppressMessages(census_api_key(readLines(API_KEY_FILE)))
} else {
  cat("Paste your API key here", file = API_KEY_FILE)
  stop(sprintf("Please paste your Census API key into %s", API_KEY_FILE))
}
# pbapply progress bars show elapsed/remaining time
pboptions(type = "timer")
# totalcensus reads its local extracts from the PATH_TO_CENSUS directory
Sys.setenv(PATH_TO_CENSUS = TOTALCENSUS_CACHE)
# Functions for Stage 1 ---------------------------------------------------
### Formula manipulation ----
# Left-hand side of a two-sided formula (e.g. `y` in y ~ x), as a language
# object.
lhs <- function(f) f[[2]]
# Right-hand side of a two-sided formula (e.g. `x + 1` in y ~ x + 1), as a
# language object.
rhs <- function(f) f[[3]]
# Character vector of every variable name referenced on the right-hand side
# of a formula.
rhs_variables <- function(f) {
  all.vars(f[[3]])
}
## Driver templates ----
# Driver interface stub: documents the contract that every *_variables
# driver (tidycensus/totalcensus) must satisfy. Intentionally has no body.
.get_variables <- function(year, dataset) {
  # Return a data.frame-like object of all variables available in the given
  # year-dataset combination with the following columns:
  #
  # * name: character of variable names (e.g. "B01001_001")
  # * label: character describing the variable (e.g. "Estimate!!Total:")
  # * concept: character describing the variable category (e.g. "SEX BY AGE")
  #
  # These do not necessarily have to be formatted as in the above examples; the
  # explain(...) function will handle formatting by stripping punctuation and
  # standardizing the letter case.
}
# Driver interface stub: documents the contract that every data-fetching
# driver (tidycensus/totalcensus) must satisfy. Intentionally has no body.
.get <- function(geography,
                 year,
                 dataset, # "acs5" / "sf1" / "sf3"
                 needed_variables,
                 states = STATES_DC_FIPS, # TODO: check if states= agrees with the last run
                 check_variables = TRUE,
                 ...
                 ) {
  # Return a data.frame-like object for all needed_variables in a given
  # geography-year-dataset combination, within the given states. If
  # check_variables is TRUE, first check needed_variables against
  # .get_variables(...) and raise an error if not all variables are available.
  #
  # The output should have a column of GEOIDs and all other columns should be
  # the needed_variables.
}
## tidycensus drivers ----
# Retrieve a list of available variables from the Census API via tidycensus
# Retrieve the list of available variables for a year-dataset combination,
# using a local csv.gz cache under TIDYCENSUS_CACHE and falling back to
# tidycensus::load_variables() on a cache miss.
get_tidycensus_variables_cached <- function(year, dataset) {
  cache_dir <- file.path(TIDYCENSUS_CACHE, dataset, year)
  dir.create(cache_dir, showWarnings = FALSE, recursive = TRUE)
  cache_path <- file.path(cache_dir, "variables.csv.gz")
  if (!file.exists(cache_path)) {
    # Cache miss: hit the Census API and persist the result for next time
    variables <- load_variables(year, dataset)
    fwrite(variables, cache_path)
    return(variables)
  }
  fread(cache_path)
}
# Retrieve data from the Census API via tidycensus
# Retrieve data from the Census API via tidycensus, with a per-variable
# on-disk cache.
#
# Each fetched variable is stored as a one-column csv.gz file under
# TIDYCENSUS_CACHE/<dataset>/<year>/<geography>/; subsequent calls only hit
# the API for variables not already cached.
#
# Arguments:
# * geography: Census geography (e.g. "county", "tract")
# * year: data year
# * dataset: "acs5"/"acs3"/"acs1" (fetched via get_acs) or a decennial
#   summary file such as "sf1"/"sf3" (fetched via get_decennial)
# * needed_variables: character vector of variable names (e.g. "B01001_001")
# * states: state FIPS codes to iterate over for state-scoped geographies
#   TODO: check if states= agrees with the last run
# * check_variables: if TRUE, error when any needed variable is unavailable
# * ...: forwarded to the tidycensus fetch function
#
# Returns a data.table with a GEOID column plus one column per variable.
get_tidycensus_cached <- function(geography,
                                  year,
                                  dataset, # "acs5" / "sf1" / "sf3"
                                  needed_variables,
                                  states = STATES_DC_FIPS, # TODO: check if states= agrees with the last run
                                  check_variables = TRUE,
                                  ...
                                  ) {
  cache_directory <- file.path(TIDYCENSUS_CACHE, dataset, year, geography)
  dir.create(cache_directory, showWarnings = FALSE, recursive = TRUE)
  if (check_variables) {
    message("Checking variables")
    all_variables <- get_tidycensus_variables_cached(year, dataset)[["name"]]
    missing_variables <- setdiff(needed_variables, all_variables)
    # BUG FIX: was length(missing_variables > 0), which takes the length of a
    # comparison vector instead of comparing the length itself
    if (length(missing_variables) > 0) {
      stop(sprintf(
        "Unavailable variables: %s",
        paste(missing_variables, collapse = ", ")
      ))
    }
  }
  # List of cached variables, minus GEOID. fixed = TRUE so the dots in the
  # extension are matched literally rather than as regex wildcards.
  cached_variables <- intersect(
    needed_variables,
    setdiff(
      gsub(".csv.gz", "", list.files(cache_directory), fixed = TRUE),
      "GEOID"
    )
  )
  # Variables that need to be fetched
  variables_to_fetch <- setdiff(needed_variables, cached_variables)
  # Function used for fetching: ACS datasets go through get_acs(); anything
  # else is assumed to be a decennial summary file
  if (dataset %in% c("acs5", "acs3", "acs1")) {
    fetch_function <- function(...) {
      return(get_acs(..., survey = dataset))
    }
    value_column <- "estimate"
  } else {
    # (dead assignment `fetch_function <- get_decennial` removed; it was
    # immediately overwritten by the closure below)
    fetch_function <- function(...) {
      return(get_decennial(..., sumfile = dataset))
    }
    value_column <- "value"
  }
  message(sprintf(
    "Existing variables: %s/%s (%0.2f%%)\nNeed to fetch %s",
    length(cached_variables),
    length(needed_variables),
    # BUG FIX: was "* 2"; converting a proportion to a percentage needs * 100
    length(cached_variables) / length(needed_variables) * 100,
    length(variables_to_fetch)
  ))
  # Retrieve cached data
  if (length(cached_variables) > 0) {
    message("Loading cached data")
    cached_data <- pblapply(
      c("GEOID", cached_variables),
      function(variable) {
        # Warnings arise here when "GEOID" is not a column that is present
        suppressWarnings(fread(
          file.path(cache_directory, sprintf("%s.csv.gz", variable)),
          colClasses = list(character = "GEOID")
        ))
      }
    )
    message("Binding columns")
    cached_data <- do.call(cbind, cached_data)
  }
  # Retrieve new data
  if (length(variables_to_fetch) > 0) {
    message("Fetching new data")
    if (geography %in% GEOGRAPHIES_NO_STATE) {
      new_data <- as.data.table(suppressMessages(fetch_function(
        geography, variables_to_fetch, year = year, ...
      )))
      if (geography %in% c("zip code tabulation area", "zcta")) {
        # ZCTA rows carry the GEOID as the second word of NAME; extract it
        new_data[, GEOID := tstrsplit(NAME, " ")[[2]]]
      }
      # Long -> wide: one row per GEOID, one column per variable
      new_data <- dcast(
        new_data,
        GEOID ~ variable,
        value.var = value_column
      )
    } else {
      # State-scoped geographies must be fetched one state at a time
      new_data <- dcast(
        rbindlist(pblapply(
          states,
          function(state) {
            suppressMessages(fetch_function(
              geography, variables_to_fetch, year = year, state = state
            ))
          }
        )),
        GEOID ~ variable,
        value.var = value_column
      )
    }
    # Cache new data, one file per column (GEOID included)
    message("Caching new data")
    invisible(pblapply(
      names(new_data),
      function(variable) {
        cache_file <- file.path(cache_directory, sprintf("%s.csv.gz", variable))
        if (!file.exists(cache_file)) {
          fwrite(subset(new_data, select = variable), cache_file)
        }
      }
    ))
    if (length(cached_variables) > 0) {
      # Both new data and cached data: merge on GEOID
      data <- new_data[cached_data, on = list(GEOID)]
    } else {
      # Only new data
      data <- new_data
    }
  } else {
    # Only cached data
    data <- cached_data
  }
  return(data)
}
## totalcensus drivers ----
# Retrieve the list of available variables from the local totalcensus
# extracts, reshaped to match the .get_variables() driver contract
# (name / label / concept columns).
get_totalcensus_variables <- function(year, dataset) {
  # totalcensus refers to the decennial summary files collectively as "dec"
  if (dataset %in% c("sf1", "sf3")) {
    warning(sprintf(
      "Translating dataset \"%s\" to \"dec\" for totalcensus compatibility",
      dataset
    ))
    dataset <- "dec"
  }
  # NOTE(review): the `year` argument is ignored and 2009 is hard-coded as
  # the second argument of search_tablecontents() -- confirm whether this
  # should be `year`, and what search_tablecontents() expects in that
  # position (its second parameter may be a keyword filter, not a year)
  result <- search_tablecontents(dataset, 2009, view = FALSE)
  # Drop rows with any NA before renaming to the driver's column names
  available <- complete.cases(result)
  return(result[available
                ][, list(name = reference,
                         label = table_content,
                         concept = table_name)])
}
# Retrieve data from local totalcensus extracts (no API calls).
#
# Arguments:
# * geography: Census geography, or a raw summary-level code; ZCTA aliases
#   are translated to summary level "860" with states = "US"
# * year: data year, forwarded to the totalcensus read function
# * dataset: "acs5"/"acs1" or a decennial summary file ("sf1"/"sf3"/"dec")
# * needed_variables: character vector of table_contents references
# * states: state abbreviations/FIPS codes to read
#   TODO: check if states= agrees with the last run
# * check_variables: if TRUE, error when any needed variable is unavailable
# * ...: accepted for driver-signature compatibility (currently unused)
#
# Returns a data.frame-like object with a GEOID column (summary-level prefix
# stripped) plus one column per needed variable.
get_totalcensus <- function(geography,
                            year,
                            dataset, # "acs5" / "sf1" / "sf3"
                            states = STATES_DC_FIPS, # BUG FIX: was `states_DC`, an undefined name
                            needed_variables,
                            check_variables = TRUE,
                            ...
                            ) {
  # totalcensus refers to the decennial summary files collectively as "dec"
  if (dataset %in% c("sf1", "sf3")) {
    warning(sprintf(
      "Translating dataset \"%s\" to \"dec\" for totalcensus compatibility",
      dataset
    ))
    dataset <- "dec"
  }
  # Pick the totalcensus reader matching the dataset
  if (dataset == "acs5") {
    read_function <- read_acs5year
  } else if (dataset == "acs1") {
    read_function <- read_acs1year
  } else if (dataset == "dec") {
    read_function <- read_decennial
  } else {
    stop(sprintf("Dataset %s not supported", dataset))
  }
  # ZCTAs are a national geography: summary level 860, read nationwide
  if (geography %in% c("zip code tabulation area", "zcta")) {
    geography <- "860"
    states <- "US"
  }
  if (check_variables) {
    message("Checking variables")
    all_variables <- get_totalcensus_variables(year, dataset)[["name"]]
    missing_variables <- setdiff(needed_variables, all_variables)
    # BUG FIX: was length(missing_variables > 0); compare the length itself
    if (length(missing_variables) > 0) {
      stop(sprintf(
        "Unavailable variables: %s",
        paste(missing_variables, collapse = ", ")
      ))
    }
  }
  result <- read_function(
    year = year,
    states = states,
    table_contents = needed_variables,
    summary_level = geography
  )
  # Strip the summary-level prefix (e.g. "86000US") so GEOIDs match the
  # tidycensus driver's output
  result[["GEOID"]] <- gsub("^[0-9]+US", "", result[["GEOID"]])
  return(subset(result, select = c("GEOID", needed_variables)))
}
## User-facing functions ----
# census_fetch_function should be a function returning a data.frame-like object
# with the GEOIDs in the first column and all other specified variables in other
# columns
# Fetch Census data and evaluate derived-variable formulas against it.
#
# census_fetch_function should be a function returning a data.frame-like
# object with the GEOIDs in the first column and all other specified
# variables in other columns.
#
# Arguments:
# * formulas: list of two-sided formulas (e.g. total ~ B01001_001); each
#   right-hand side is evaluated against the fetched data and stored in a
#   column named after the left-hand side
# * census_fetch_function: driver implementing the .get() contract
# * states: state FIPS codes, forwarded to census_fetch_function
#
# Returns a data.table with year, dataset, GEOID, and one column per formula.
get_census <- function(geography,
                       year,
                       dataset,
                       formulas,
                       census_fetch_function = get_tidycensus_cached,
                       states = STATES_DC_FIPS, ...) {
  # Every Census variable referenced by any formula's right-hand side
  needed_variables <- unique(unlist(lapply(formulas, rhs_variables)))
  # BUG FIX: `states` was accepted but never forwarded, so callers could not
  # actually restrict the fetch to a subset of states
  data <- census_fetch_function(geography, year, dataset, needed_variables,
                                states = states, ...)
  result <- data.table(
    year = year,
    dataset = dataset,
    GEOID = data[["GEOID"]]
  )
  message("Evaluating formulas")
  # Plain loop instead of invisible(pblapply(...)) + `<<-`: keeps the
  # assignment local, and [[<- needs a character name rather than the raw
  # symbol that lhs() returns
  for (f in formulas) {
    variable <- as.character(lhs(f))
    # Evaluate the formula's RHS with the fetched columns in scope
    result[[variable]] <- eval(rhs(f), envir = data)
  }
  return(result)
}
# Fill in variables from a formula using definitions from the Census
# Fill in variables from a formula using definitions from the Census.
#
# Deparses the formula and replaces each right-hand-side variable name with
# a human-readable "[Concept$Label]" annotation built from the variable
# dictionary, with tidycensus's "Estimate!!"/"!!" markup cleaned up.
#
# Arguments:
# * year, dataset: passed to get_variables to pick the dictionary
# * formula: two-sided formula whose RHS variables are annotated
# * delimiter: separator between concept and label segments
# * get_variables: function returning a name/label/concept table
#   (the .get_variables() driver contract)
#
# Returns the annotated formula as a single character string.
explain <- function(year,
                    dataset,
                    formula,
                    # width = NULL,
                    delimiter = "$",
                    get_variables = get_tidycensus_variables_cached) {
  variables <- get_variables(year, dataset)
  # result <- format(formula, width = width)
  result <- deparse1(formula)
  # Plain loop instead of sapply(): the iteration exists purely for its
  # side effect of rewriting `result`
  for (variable in rhs_variables(formula)) {
    definition <- subset(variables, name == variable)
    if (nrow(definition) == 0) {
      # ROBUSTNESS FIX: previously a missing variable produced a
      # zero-length label and crashed gsub(); warn and leave the name as-is
      warning(sprintf("No definition found for variable %s", variable))
      next
    }
    # Use the first match in case the dictionary has duplicate names
    definition <- definition[1, ]
    label <- sprintf(
      "%s%s%s",
      toTitleCase(tolower(definition[["concept"]])),
      delimiter,
      definition[["label"]]
    )
    label <- gsub("Estimate!!", "", label)
    label <- gsub(":$", "", label)
    label <- gsub(":*!!", delimiter, label)
    label <- sprintf("[%s]", label)
    result <- gsub(variable, label, result, fixed = TRUE)
  }
  return(result)
}
## Data retrieval ----
# Download a zip archive from `url` and extract it into `output_directory`.
#
# Prefers the system `unzip` binary, falling back to R's built-in unzip()
# when the binary is unavailable or exits with a non-zero status. The
# temporary archive is removed even if the download or extraction fails.
download_zip <- function(url, output_directory, temp_file = "temp.zip") {
  # message(sprintf("Downloading %s to %s", url, output_directory))
  if (file.exists(temp_file)) {
    file.remove(temp_file)
  }
  # Guarantee cleanup of the downloaded archive on any exit path
  on.exit(if (file.exists(temp_file)) file.remove(temp_file), add = TRUE)
  download.file(url, temp_file)
  # BUG FIX: system2() reports failure via its return status rather than an
  # R error, so the old tryCatch(error = ...) fallback never triggered on a
  # failed extraction; check the status (and the binary's existence) instead.
  # shQuote() protects paths containing spaces.
  status <- if (nzchar(Sys.which("unzip"))) {
    system2("unzip", args = c("-d", shQuote(output_directory), shQuote(temp_file)))
  } else {
    127L # unzip binary not found
  }
  if (!identical(as.integer(status), 0L)) {
    unzip(temp_file, exdir = output_directory)
  }
  invisible(NULL)
}