-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstep_0_source_me_data.R
More file actions
113 lines (109 loc) · 6.55 KB
/
step_0_source_me_data.R
File metadata and controls
113 lines (109 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright 2022 Province of British Columbia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
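# This script scrapes the B.C. Major Projects Inventory (MPI) "recent reports" page at
# https://www2.gov.bc.ca/gov/content/employment-business/economic-development/industry/bc-major-projects-inventory/recent-reports,
# downloads the Excel workbooks linked there, and pre-processes them into a single data set.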
#libraries--------
if(!"tidyverse" %in% names(sessionInfo()$otherPkgs)) library(tidyverse)
if(!"lubridate" %in% names(sessionInfo()$otherPkgs)) library(lubridate) #years and months not exported objects
#functions--------
#' Keep only the MPI columns this script works with.
#'
#' The inventory workbooks have used two naming schemes over time (e.g.
#' "project_id" vs "proj_id"), so both spellings of every field are listed.
#' Columns are returned in the order they appear in `df`; nothing is reordered
#' or renamed here.
#'
#' @param df A data frame (or tibble) whose names have already been cleaned.
#' @return `df` restricted to the recognised columns, same rows, same class.
keep_columns <- function(df) {
  wanted_columns <- c(
    "project_id", "proj_id",
    "project_name", "proj_nm",
    "project_description", "description",
    "estimated_cost", "est_cost",
    "project_type", "proj_typ",
    "region",
    "municipality", "muni_nm",
    "developer", "dvlpr_nm",
    "project_status", "status",
    "project_stage", "stage",
    "project_category_name", "proj_cat",
    "construction_type", "proj_cons_typ",
    "construction_subtype", "proj_con_subtyp",
    "public_funding_ind", "public",
    # "provinvial_funding","province",
    # "federal_funding","federal",
    # "municipal_funding","municipal",
    # "other_public_funding","other_province",
    "green_building_ind", "green_building",
    "clean_energy_ind", "clean_energy",
    "first_nation_ind", "indigenous_ind", "fn",
    "first_entry_date", "entry_dt",
    "telephone", "fin_by",
    "last_update", "last_up_dt"
  )
  # Logical mask over the existing names: preserves the input column order and
  # silently ignores wanted names that are absent from this particular file.
  is_wanted <- names(df) %in% wanted_columns
  df[, is_wanted, drop = FALSE]
}
#the program----------------
# Make sure the download folder exists before anything is written into it.
raw_data_dir <- here::here("raw_data")
if (!dir.exists(raw_data_dir)) dir.create(raw_data_dir, recursive = TRUE)
mpi_url_to_scrape <- "https://www2.gov.bc.ca/gov/content/employment-business/economic-development/industry/bc-major-projects-inventory/recent-reports"
mpi_scraped <- rvest::read_html(mpi_url_to_scrape)
mpi_links <- rvest::html_attr(rvest::html_nodes(mpi_scraped, "a"), "href") # href of every anchor (NA when an anchor has none)
# Keep only the site-relative links that point at Excel workbooks.
# Anchors without an href come back as NA, so they are excluded explicitly.
is_mpi_workbook <- !is.na(mpi_links) &
  startsWith(mpi_links, "/assets/") &
  endsWith(mpi_links, ".xlsx")
mpi_links <- mpi_links[is_mpi_workbook]
if (length(mpi_links) == 0) {
  # Fail here with a clear message rather than later with an obscure one.
  stop("No .xlsx links found at ", mpi_url_to_scrape, call. = FALSE)
}
mpi_links <- paste0("https://www2.gov.bc.ca", mpi_links) # paste the head onto the stubs
mpi_files <- paste0("mpi_dl", seq_along(mpi_links), ".xlsx")
# mode = "wb" forces a binary transfer; without it Windows rewrites line
# endings and the downloaded workbooks are unreadable.
download_status <- mapply(
  download.file,
  mpi_links,
  here::here("raw_data", mpi_files),
  MoreArgs = list(mode = "wb")
)
# download.file() returns 0 on success and non-zero on failure.
if (any(download_status != 0)) {
  stop(
    "Download failed for: ",
    paste(mpi_links[download_status != 0], collapse = ", "),
    call. = FALSE
  )
}
# Worksheet names of every downloaded workbook: a list with one character
# vector per file, named by the file's full path.
mpi_paths <- here::here("raw_data", mpi_files)
mpi_all_sheets <- lapply(mpi_paths, readxl::excel_sheets)
names(mpi_all_sheets) <- mpi_paths
sheet_starts_with_mpi <- lapply(mpi_all_sheets, function(x) x[startsWith(x, "mpi")]) %>%
  unlist(use.names = FALSE)
sheet_starts_with_Full <- lapply(mpi_all_sheets, function(x) x[startsWith(x, "Full")]) %>%
  unlist(use.names = FALSE)
#file structure changed significantly in 2016... only use recent files--------
# Only workbooks that contain a sheet whose name starts with "mpi" or "Full"
# are read. Each workbook is paired with its OWN matching sheet(s), so a file
# with no matching sheet is skipped and a file with several contributes one
# row per sheet; file and sheet can never get out of step.
sheets_to_read <- lapply(
  mpi_all_sheets,
  function(x) x[startsWith(x, "mpi") | startsWith(x, "Full")]
)
short_sheets <- as.character(unlist(sheets_to_read, use.names = FALSE))
short_files <- rep(mpi_files, lengths(sheets_to_read))
if (length(short_sheets) == 0) {
  stop("None of the downloaded workbooks has a sheet starting with 'mpi' or 'Full'.", call. = FALSE)
}
# One row per (file, sheet); `data` holds the sheet contents with cleaned
# names, reduced to the columns keep_columns() recognises.
short_nested <- tibble(file = here::here("raw_data", short_files), sheet = short_sheets) %>%
  mutate(
    data = map2(file, sheet, readxl::read_excel),
    data = map(data, janitor::clean_names),
    data = map(data, keep_columns)
  )
# Separators after which the rest of a municipality string is discarded
# (e.g. "Vancouver, Burnaby" -> "Vancouver"). They are applied in this order.
# "And" and "To" are anchored at word boundaries so that names that merely
# begin with those letters (e.g. "Tofino", "Anderson Lake") are not reduced
# to an empty string.
municipality_separators <- c(
  ",", "/", "-",
  "\\bAnd\\b", "\\bTo\\b",
  "Area", "area", "region", "south of"
)
# Stack the per-file tables into one tibble and derive the analysis columns.
# use.names = FALSE binds by column POSITION: the result takes its column
# names from the first table, and the files may spell the same field
# differently (see keep_columns), so every file must present its columns in
# the same order.
mpi_shortraw <- short_nested$data %>%
  data.table::rbindlist(use.names = FALSE) %>%
  as_tibble() %>%
  mutate(
    source = mpi_url_to_scrape,
    published_dates = last_update,
    # Merge the two alternative spellings into a single category.
    project_category_name = fct_collapse(
      project_category_name,
      `Residential & Commercial` = c(
        "Residential/Commercial",
        "Residential Commercial"
      )
    )
  ) %>%
  mutate(
    # Computed before last_update is truncated to a Date below.
    days_in_inventory = as.numeric(difftime(last_update, first_entry_date, units = "days")),
    quarter = tsibble::yearquarter(published_dates)
  ) %>%
  mutate(
    last_update = as.Date(last_update),
    # Put the listed levels first, in this order, then make the factor ordered.
    project_status = factor(
      fct_relevel(project_status, "Proposed", "On hold", "Construction started", "Completed"),
      ordered = TRUE
    ),
    project_stage = factor(
      fct_relevel(project_stage, "Preliminary/Feasibility", "Consultation/Approvals", "Tender/Preconstruction", "Permitting"),
      ordered = TRUE
    ),
    # NOTE(review): labels = c("no", "yes") requires exactly two distinct
    # values per indicator, sorted so the "no" value comes first - confirm
    # against the data.
    public_funding_ind = factor(public_funding_ind, labels = c("no", "yes")),
    green_building_ind = factor(green_building_ind, labels = c("no", "yes")),
    clean_energy_ind = factor(clean_energy_ind, labels = c("no", "yes")),
    indigenous_ind = factor(indigenous_ind, labels = c("no", "yes"))
  ) %>%
  rename(
    public_funding = public_funding_ind,
    green_building = green_building_ind,
    clean_energy = clean_energy_ind,
    indigenous = indigenous_ind,
    project_category = project_category_name
  ) %>%
  mutate(
    # Keep only the text before the first occurrence of each separator in turn.
    municipality = purrr::reduce(
      municipality_separators,
      function(name, separator) word(name, 1, sep = separator),
      .init = municipality
    ),
    municipality = trimws(municipality),
    municipality = str_replace_all(municipality, "Tri", "Coquitlam")
  )
# Attach the location columns to each project and write the result to disk.
locations_path <- here::here("processed_data", "mpi_locations.csv")
lat_lon <- read_csv(locations_path)
# `place` is a comma-separated string; the text before the first comma becomes
# the join key, and `place` itself is then dropped.
lat_lon <- mutate(lat_lon, municipality = word(place, 1, sep = ","))
lat_lon <- select(lat_lon, -place)
# Left join: every project row is kept, and a project is repeated once per
# matching location row (multiple = "all").
short <- left_join(
  mpi_shortraw,
  lat_lon,
  by = "municipality",
  multiple = "all"
)
output_path <- here::here("processed_data", "mpi_shortraw.rds")
saveRDS(short, output_path)