Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions R/GVA_by_sector.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,19 @@
GVA_by_sector <- function(
combine_GVA_long = NULL,
GVA = NULL,
tourism = NULL
tourism = NULL,
charities = NULL
) {

check_class(combine_GVA_long)
check_class(GVA)
check_class(tourism)


GVA_by_sector <- dplyr::group_by(combine_GVA_long, year, sector) %>%
summarise(GVA = sum(BB16_GVA)) %>%

#append total UK GVA
#append total UK GVA by year
bind_rows(
filter(GVA, SIC == "year_total") %>%
mutate(sector = "UK") %>%
Expand All @@ -51,22 +53,39 @@ GVA_by_sector <- function(
bind_rows(
mutate(tourism, sector = "tourism") %>%
select(year, sector, GVA)
) %>%

#append charitites data
bind_rows(
mutate(charities, sector = "charities") %>%
select(year, sector, GVA)
)

#add overlap info from tourism in order to calculate GVA for sector=all_dcms
tourism_all_sectors <- mutate(tourism, sector = "all_dcms") %>%
select(year, sector, overlap)

#add overlap info from tourism in order to calculate GVA for sector=all_dcms
charities_all_sectors <- mutate(charities, sector = "all_dcms") %>%
select(year, sector, overlap)


GVA_by_sector <-
left_join(GVA_by_sector, tourism_all_sectors, by = c("year", "sector")) %>%
ungroup() %>%
mutate(GVA = ifelse(!is.na(overlap), overlap + GVA, GVA)) %>%
select(-overlap) %>%
select(-overlap)

GVA_by_sector <-
left_join(GVA_by_sector, charities_all_sectors, by = c("year", "sector")) %>%
ungroup() %>%
mutate(GVA = ifelse(!is.na(overlap), overlap + GVA, GVA)) %>%
select(-overlap)

#final clean up
GVA_by_sector <- GVA_by_sector %>%
filter(year %in% 2010:max(attr(combine_GVA_long, "years"))) %>%
mutate(GVA = round(GVA, 2),
sector = factor(sector),
mutate(sector = factor(sector),
year = as.integer(year)) %>%
select(sector, year, GVA) %>%
arrange(year, sector)
Expand All @@ -77,8 +96,9 @@ GVA_by_sector <- function(
# )

sectors_set <- c(
"charities" = "Civil Society (Non-market charities)",
"creative" = "Creative Industries",
"culture" = "Cultural Sector",
"culture" = "Cultural Sector",
"digital" = "Digital Sector",
"gambling" = "Gambling",
"sport" = "Sport",
Expand Down
80 changes: 80 additions & 0 deletions R/GVA_by_sub_sector.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#' @title Calculate GVA by sector
#'
#' @description Combines datasets exracted from the underlying spreadsheet using
#' the \code{extract_XXX} functions.
#'
#' NOTE: THIS FUNCTION RELIES ON DATA WHICH ARE CLASSIFIED AS
#' OFFICIAL-SENSITIVE. THE OUTPUT OF THIS FUNCTION IS AGGREGATED, AND
#' PUBLICALLY AVAILABLE IN THE FINAL STATISTICAL RELEASE, HOWEVER CARE MUST BE
#' EXERCISED WHEN CREATING A PIPELINE INCLUDING THIS FUNCTION. IT IS HIGHLY
#' ADVISEABLE TO ENSURE THAT THE DATA WHICH ARE CREATED BY THE \code{extract_}
#' FUNCTIONS ARE NOT STORED IN A FOLDER WHICH IS A GITHUB REPOSITORY TO
#' MITIGATE AGAINST ACCIDENTAL COMMITTING OF OFFICIAL DATA TO GITHUB. TOOLS TO
#' FURTHER HELP MITIGATE THIS RISK ARE AVAILABLE AT
#' https://github.com/ukgovdatascience/dotfiles.
#'
#' @details The best way to understand what happens when you run this function
#' is to view the source code.
#'
#'
#' @param combine_GVA_long data output from \code{eesectors::combine_GVA_long()}.
#' @param GVA ABS data as extracted by \code{eesectors::extract_GVA_data()}.
#'
#' @export
#'
#' @import dplyr


GVA_by_sub_sector <- function(
combine_GVA_long = combine_GVA_long,
GVA = GVA,
sub_sector = NULL
) {

check_class(combine_GVA_long)
check_class(GVA)

GVA_by_sub_sector <- combine_GVA_long %>%
filter(sector == sub_sector) %>%
group_by(year, sub_sector_categories) %>%
summarise(GVA = sum(BB16_GVA)) %>%
ungroup() %>%

#append total sector GVA by year
bind_rows(
combine_GVA_long %>%
filter(sector == sub_sector) %>%
group_by(year) %>%
summarise(GVA = sum(BB16_GVA)) %>%
mutate(sub_sector_categories =
paste0(toupper(substr(sub_sector, 1, 1)),
substr(sub_sector, 2, nchar(sub_sector)))) %>%
select(year, sub_sector_categories, GVA)
) %>%

#append total UK GVA by year
bind_rows(
filter(GVA, SIC == "year_total") %>%
mutate(sub_sector_categories = "UK") %>%
select(year, sub_sector_categories, GVA)
) %>%

#final clean up
filter(year %in% 2010:max(attr(combine_GVA_long, "years"))) %>%
mutate(#GVA = round(GVA, 2),
#sub_sector_categories = factor(sub_sector_categories),
year = as.integer(year)) %>%
select(sub_sector_categories, year, GVA) %>%
arrange(year, sub_sector_categories)

#check "tbl_df" "tbl" "data.frame" is class(gva_by sub sector)
# structure(
# GVA_by_sector,
# class = c("GVA_by_sector", class(combine_GVA_long)[-1])
# )

structure(
GVA_by_sub_sector,
years = sort(unique(GVA_by_sub_sector$year)),
class = c("GVA_by_sector", class(GVA_by_sub_sector)))
}
32 changes: 22 additions & 10 deletions R/combine_GVA_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,34 +34,46 @@ combine_GVA_long <- function(
ABS = NULL,
GVA = NULL,
SIC91 = NULL,
DCMS_sectors = eesectors::DCMS_sectors) {
DCMS_sectors = eesectors::sic_mappings) {

check_class(ABS)
check_class(GVA)
check_class(SIC91)

abs_year <- max(attr(ABS, "years"))
#Annual business survey, duplicate 2014 data for 2015 and
#then duplicate non SIC91 then add SIC 91 with sales data
ABS_2015 <- filter(ABS, year == abs_year) %>%
mutate(year = abs_year + 1) %>%

#this line makes no sense to me - we are just duplicated rows we already
#have so surely it is redundant??
bind_rows(filter(ABS, !SIC %in% unique(SIC91$SIC))) %>%
# #Annual business survey, duplicate 2014 data for 2015 and
# #then duplicate non SIC91 then add SIC 91 with sales data
# ABS_2015 <- filter(ABS, year == abs_year) %>%
# mutate(year = abs_year + 1) %>%
#
# #this line makes no sense to me - we are just duplicated rows we already
# #have so surely it is redundant??
# bind_rows(filter(ABS, !SIC %in% unique(SIC91$SIC))) %>%
#
# #simply appending SIC sales data which supplements the ABS for SIC 91
# bind_rows(SIC91)

ABS_2015 <- ABS %>%
filter(!SIC %in% unique(SIC91$SIC)) %>%

#simply appending SIC sales data which supplements the ABS for SIC 91
bind_rows(SIC91)

ABS_2015 <-
bind_rows(
ABS_2015,
filter(ABS_2015, year == abs_year) %>%
mutate(year = abs_year + 1))

# keep cases from ABS which have integer SIC - which is just a higher level SIC
denom <- filter(ABS_2015, SIC %in% unique(eesectors::DCMS_sectors$SIC2)) %>%
denom <- filter(ABS_2015, SIC %in% unique(eesectors::sic_mappings$SIC2)) %>%
select(year, ABS, SIC) %>%
rename(ABS_2digit_GVA = ABS, SIC2 = SIC)


#add ABS to DCMS sectors
GVA_sectors <- left_join(eesectors::DCMS_sectors, ABS_2015, by = c('SIC')) %>%
GVA_sectors <- left_join(eesectors::sic_mappings, ABS_2015, by = c('SIC')) %>%
rename(ABS_ind_GVA = ABS) %>%
#drop cases where SIC is not in that sector - should do when building DCMS_sectors
filter(present == TRUE) %>%
Expand Down
3 changes: 3 additions & 0 deletions R/extract_ABS_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ extract_ABS_data <- function(
} else
stop("Invalid format argument")

#remove duplicate SIC 92
df <- df[-149, ]

#determine most recent year of data
years <- suppressWarnings(as.numeric(colnames(df)))
years <- min(years[!is.na(years)]):max(years[!is.na(years)])
Expand Down
17 changes: 5 additions & 12 deletions R/extract_GVA_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ extract_GVA_data <- function(
col_types = rep("numeric",dend[2]))
)

# subset data to the table we want
data2 <- data.frame(t(data[dstart[1]:dend[1], (dstart[2] + 1):dend[2]]))
colnames(data2) <- unlist(data[dstart[1]:dend[1], dstart[2]])

Expand All @@ -71,9 +72,10 @@ extract_GVA_data <- function(

SIC <- as.character(t(SIC[cnames, (dstart[2] + 1):dend[2]]))
gva <- cbind(SIC, data2, stringsAsFactors = FALSE)
gva[1, 1] <- "year_total"

gva <- gva %>%
filter(SIC %in% eesectors::DCMS_sectors$SIC2)
filter(SIC %in% eesectors::DCMS_sectors$SIC2 | SIC == "year_total")


#determine most recent year of data
Expand All @@ -83,8 +85,8 @@ extract_GVA_data <- function(
#check for missing columns
na_col_test(gva)

#check number of SIC codes in dataset
if (nrow(gva) != length(unique((eesectors::DCMS_sectors$SIC2))))
#check number of SIC codes in dataset (+1 for year_total)
if (nrow(gva) != length(unique((eesectors::DCMS_sectors$SIC2))) + 1)
stop(
paste0(
"GVA data has rows for ",
Expand All @@ -99,15 +101,6 @@ extract_GVA_data <- function(
mutate(year = as.integer(year)) %>%
as.tbl()

# add total of SICs for each year
totals <- gva2 %>%
group_by(year) %>%
summarise(GVA = sum(GVA)) %>%
mutate(SIC = "year_total") %>%
select(SIC, year, GVA)

gva2 <- rbind(gva2, totals)

#check columns names
if(
!identical(
Expand Down
94 changes: 94 additions & 0 deletions R/extract_charities_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#' @title extract toursim Data ONS working file spreadsheet
#'
#' @description The data which underlies the Economic Sectors for DCMS sectors
#' data is typically provided to DCMS as a spreadsheet from the Office for
#' National Statistics. This function extracts the tourism data from that
#' spreadsheet, and saves it to .Rds format. These data are provided as the
#' usual tourism values in the GVA dataset cannot be used.
#'
#' IT IS HIGHLY ADVISEABLE TO ENSURE THAT THE DATA WHICH ARE CREATED BY THIS
#' FUNCTION ARE NOT STORED IN A FOLDER WHICH IS A GITHUB REPOSITORY TO
#' MITIGATE AGAINST ACCIDENTAL COMMITTING OF OFFICIAL DATA TO GITHUB. TOOLS TO
#' FURTHER HELP MITIGATE THIS RISK ARE AVAILABLE AT
#' https://github.com/ukgovdatascience/dotfiles.
#'
#' @details The best way to understand what happens when you run this function
#' is to look at the source code, which is available at
#' \url{https://github.com/ukgovdatascience/eesectors/blob/master/R/}. The
#' code is relatively transparent and well documented. A brief explanation of
#' what the function does here:
#'
#' 1. The function calls \code{readxl::read_excel} to load the appropriate
#' page from the underlying spreadsheet.
#'
#' 2. Sanitise the \code{colnames} using a user-supplied vector in
#' \code{new_colnames}. If there are no changes to the 2016 spreadhseet, in
#' future years, then the default vector should work in future years. If there
#' have been changes, this is likely to be a cause of errors.
#'
#' 3. Empty rows (containing all \code{NA}s) are removed.
#'
#' 4. The data are saved out to an R serialisation object
#' \code{OFFICIAL_tourism.Rds} in the specified folder.
#'
#' @param x Location of the input spreadsheet file. Named something like
#' "working_file_dcms_VXX.xlsm".
#' @param sheet_name The name of the spreadsheet in which the data are stored.
#' Defaults to \code{New ABS Data}.
#' @param col_names character vector used to rename the column names from the
#' imported spreadsheet. Defaults to
#' \code{c('year','gva','total','perc','overlap')}.
#'
#' @return The function returns nothing, but saves the extracted dataset to
#' \code{file.path(output_path, 'OFFICIAL_ABS.Rds')}. This is an R data
#' object, which retains the column types which would be lost if converted to
#' a flat format like CSV.
#'
#' @examples
#'
#' \dontrun{
#' library(eesectors)
#' extract_toursim_data(
#' x = 'OFFICIAL_working_file_dcms_V13.xlsm',
#' sheet_name = 'Tourism'
#' )
#' }
#'
#' @export

extract_charities_data <- function(
x,
sheet_name = 'Charities',
col_names = c('year','GVA','total','perc','overlap')
) {

# Load the data using readr. Note that additional arguments (e.g. skip, and
# colnames) can be passed to read_excel using the ... operator

df <- readxl::read_excel(path = x, sheet = sheet_name, col_names = TRUE, skip = 1)

# Standardise the column names.

colnames(df) <- col_names

# Remove the extraneous rows, byt first checking whether they are all NA.
df2 <- df[1:7, 1:5]

# Return the data to the console.

message(
'################################# WARNING #################################
The data produced by this function may contain OFFICIAL information.
Ensure that the data are not committed to a github repository.
Tools to prevent the accidental committing of data are available at:
https://github.com/ukgovdatascience/dotfiles. Pay special attention
to .Rdata files, and .Rhistory files produced by Rstudio. Best practice
is to disable the creation of such files.'
)

structure(
df2,
class = c("charities", class(df2))
)

}
2 changes: 2 additions & 0 deletions R/extract_tourism_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ extract_tourism_data <- function(

x <- x[mask,]

x <- x[, 1:5]

# Return the data to the console.

message(
Expand Down
Loading