-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaning.R
More file actions
72 lines (55 loc) · 2.26 KB
/
cleaning.R
File metadata and controls
72 lines (55 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(tidyverse)
library(dplyr)
library(janitor)
# Reshape long-format ACS estimates into one row per geography and derive the
# features kept for analysis.
#
# Args:
#   df: Long-format ACS table. After janitor::clean_names() it must contain
#     the columns `geoid`, `name`, `variable`, `estimate` and `moe`, and the
#     `variable` column must supply total_pop, high_school_grads,
#     has_bachelors, below_100_percent_poverty, uninsured_26_34_male,
#     total_male, uninsured_26_34_female, total_female and median_income.
#
# Returns:
#   A table with `geoid` (integer), `name`, five `percent_*` columns on a
#   0-100 scale, and `log_median_income`.
clean_acs_data <- function(df) {
  # Share of `part` in `whole` on a 0-100 scale. Dividing first and scaling
  # second keeps the floating point result identical to scaling afterwards.
  as_percent <- function(part, whole) {
    part / whole * 100
  }

  # convert acs data to wide format; `moe` is dropped first because it would
  # otherwise be treated as an identifying column by pivot_wider()
  acs_wide <- df %>%
    clean_names() %>%
    select(-moe) %>%
    pivot_wider(names_from = variable, values_from = estimate)

  # generate features from acs wide data
  acs_wide %>%
    mutate(
      percent_hs_grad = as_percent(high_school_grads, total_pop),
      percent_college_grad = as_percent(has_bachelors, total_pop),
      percent_below_100_poverty = as_percent(
        below_100_percent_poverty, total_pop
      ),
      percent_male_26_34_uninsured = as_percent(
        uninsured_26_34_male, total_male
      ),
      percent_female_26_34_uninsured = as_percent(
        uninsured_26_34_female, total_female
      ),
      log_median_income = log(median_income),
      # integer conversion already drops a leading zero ("01001" -> 1001), so
      # no manual string surgery is needed; as.character() guards against
      # factor input being converted to its internal codes
      geoid = as.integer(as.character(geoid))
    ) %>%
    select(
      geoid,
      name,
      percent_hs_grad,
      percent_college_grad,
      percent_below_100_poverty,
      percent_male_26_34_uninsured,
      percent_female_26_34_uninsured,
      log_median_income
    )
}
# Reduce the prescription table to the county identifier and one value column.
#
# Args:
#   df: Prescription table whose column names, after janitor::clean_names(),
#     include `state_county_fips_code` and `x2017`.
#
# Returns:
#   A two-column table: `geoid` (the FIPS code column, values unchanged) and
#   `prescriptions_per_100` (the `x2017` column, values unchanged).
clean_prescription_data <- function(df) {
  tidy_names <- clean_names(df)

  # rename while selecting so the FIPS column carries the same name as the
  # acs `geoid` column; no values are modified here
  select(
    tidy_names,
    geoid = state_county_fips_code,
    prescriptions_per_100 = x2017
  )
}
# Standardise the overdose table: clean the column names, coerce the death
# counts and crude rates to numeric, and keep the columns used downstream.
#
# Args:
#   df: Overdose table whose column names, after janitor::clean_names(),
#     include `county_code`, `crude_rate`, `deaths` and `population`.
#
# Returns:
#   A table with `geoid` (from `county_code`), `crude_death_rate` (numeric),
#   `deaths` (numeric) and `population` (returned as supplied).
clean_overdose_data <- function(df) {
  # Entries that cannot be parsed as numbers become NA; the coercion warning
  # is silenced on purpose. as.character() makes this safe for factor input.
  to_numeric <- function(x) {
    suppressWarnings(as.numeric(as.character(x)))
  }

  # NOTE(review): a `log_population` column used to be computed here but was
  # never selected, so it never reached the caller. The dead computation has
  # been removed and the output columns are unchanged. `population` is not
  # coerced - confirm whether a numeric or log-transformed population was
  # intended.
  df %>%
    clean_names() %>%
    mutate(
      crude_rate = to_numeric(crude_rate),
      deaths = to_numeric(deaths)
    ) %>%
    select(
      geoid = county_code,
      crude_death_rate = crude_rate,
      deaths,
      population
    )
}