DCMSstats · maxwell8888 · Nov 20, 2017 · Nov 27, 2017
diff --git a/R/GVA_by_sector.R b/R/GVA_by_sector.R
@@ -30,17 +30,19 @@
 GVA_by_sector <- function(
   combine_GVA_long = NULL,
   GVA = NULL,
-  tourism = NULL
+  tourism = NULL,
+  charities = NULL
 ) {
 
   check_class(combine_GVA_long)
   check_class(GVA)
   check_class(tourism)
 
+
   GVA_by_sector <- dplyr::group_by(combine_GVA_long, year, sector) %>%
     summarise(GVA = sum(BB16_GVA)) %>%
 
-    #append total UK GVA
+    #append total UK GVA by year
     bind_rows(
       filter(GVA, SIC == "year_total") %>%
         mutate(sector = "UK") %>%
@@ -51,22 +53,39 @@ GVA_by_sector <- function(
     bind_rows(
       mutate(tourism, sector = "tourism") %>%
         select(year, sector, GVA)
+    ) %>%
+
+    #append charitites data
+    bind_rows(
+      mutate(charities, sector = "charities") %>%
+        select(year, sector, GVA)
     )
 
   #add overlap info from tourism in order to calculate GVA for sector=all_dcms
   tourism_all_sectors <- mutate(tourism, sector = "all_dcms") %>%
     select(year, sector, overlap)
 
+  #add overlap info from tourism in order to calculate GVA for sector=all_dcms
+  charities_all_sectors <- mutate(charities, sector = "all_dcms") %>%
+    select(year, sector, overlap)
+
+
   GVA_by_sector <-
     left_join(GVA_by_sector, tourism_all_sectors, by = c("year", "sector")) %>%
     ungroup() %>%
     mutate(GVA = ifelse(!is.na(overlap), overlap + GVA, GVA)) %>%
-    select(-overlap) %>%
+    select(-overlap)
+
+  GVA_by_sector <-
+    left_join(GVA_by_sector, charities_all_sectors, by = c("year", "sector")) %>%
+    ungroup() %>%
+    mutate(GVA = ifelse(!is.na(overlap), overlap + GVA, GVA)) %>%
+    select(-overlap)
 
     #final clean up
+  GVA_by_sector <- GVA_by_sector %>%
     filter(year %in% 2010:max(attr(combine_GVA_long, "years"))) %>%
-    mutate(GVA = round(GVA, 2),
-           sector = factor(sector),
+    mutate(sector = factor(sector),
            year = as.integer(year)) %>%
     select(sector, year, GVA) %>%
     arrange(year, sector)
@@ -77,8 +96,9 @@ GVA_by_sector <- function(
   # )
 
   sectors_set <- c(
+    "charities"   = "Civil Society (Non-market charities)",
     "creative"    = "Creative Industries",
-    "culture"    = "Cultural Sector",
+    "culture"     = "Cultural Sector",
     "digital"     = "Digital Sector",
     "gambling"    = "Gambling",
     "sport"       = "Sport",

diff --git a/R/GVA_by_sub_sector.R b/R/GVA_by_sub_sector.R
@@ -0,0 +1,80 @@
+#' @title Calculate GVA by sector
+#'
+#' @description Combines datasets exracted from the underlying spreadsheet using
+#'   the \code{extract_XXX} functions.
+#'
+#'   NOTE: THIS FUNCTION RELIES ON DATA WHICH ARE CLASSIFIED AS
+#'   OFFICIAL-SENSITIVE. THE OUTPUT OF THIS FUNCTION IS AGGREGATED, AND
+#'   PUBLICALLY AVAILABLE IN THE FINAL STATISTICAL RELEASE, HOWEVER CARE MUST BE
+#'   EXERCISED WHEN CREATING A PIPELINE INCLUDING THIS FUNCTION. IT IS HIGHLY
+#'   ADVISEABLE TO ENSURE THAT THE DATA WHICH ARE CREATED BY THE \code{extract_}
+#'   FUNCTIONS ARE NOT STORED IN A FOLDER WHICH IS A GITHUB REPOSITORY TO
+#'   MITIGATE AGAINST ACCIDENTAL COMMITTING OF OFFICIAL DATA TO GITHUB. TOOLS TO
+#'   FURTHER HELP MITIGATE THIS RISK ARE AVAILABLE AT
+#'   https://github.com/ukgovdatascience/dotfiles.
+#'
+#' @details The best way to understand what happens when you run this function
+#'   is to view the source code.
+#'
+#'
+#' @param combine_GVA_long data output from \code{eesectors::combine_GVA_long()}.
+#' @param GVA ABS data as extracted by \code{eesectors::extract_GVA_data()}.
+#'
+#' @export
+#'
+#' @import dplyr
+
+
+GVA_by_sub_sector <- function(
+  combine_GVA_long = combine_GVA_long,
+  GVA = GVA,
+  sub_sector = NULL
+) {
+
+  check_class(combine_GVA_long)
+  check_class(GVA)
+
+  GVA_by_sub_sector <- combine_GVA_long %>%
+    filter(sector == sub_sector) %>%
+    group_by(year, sub_sector_categories) %>%
+    summarise(GVA = sum(BB16_GVA)) %>%
+    ungroup() %>%
+
+    #append total sector GVA by year
+    bind_rows(
+      combine_GVA_long %>%
+        filter(sector == sub_sector) %>%
+        group_by(year) %>%
+        summarise(GVA = sum(BB16_GVA)) %>%
+        mutate(sub_sector_categories =
+                 paste0(toupper(substr(sub_sector, 1, 1)),
+                       substr(sub_sector, 2, nchar(sub_sector)))) %>%
+        select(year, sub_sector_categories, GVA)
+    ) %>%
+
+    #append total UK GVA by year
+    bind_rows(
+      filter(GVA, SIC == "year_total") %>%
+        mutate(sub_sector_categories = "UK") %>%
+        select(year, sub_sector_categories, GVA)
+    ) %>%
+
+    #final clean up
+    filter(year %in% 2010:max(attr(combine_GVA_long, "years"))) %>%
+    mutate(#GVA = round(GVA, 2),
+           #sub_sector_categories = factor(sub_sector_categories),
+           year = as.integer(year)) %>%
+    select(sub_sector_categories, year, GVA) %>%
+    arrange(year, sub_sector_categories)
+
+  #check "tbl_df"     "tbl"        "data.frame" is class(gva_by sub sector)
+  # structure(
+  #   GVA_by_sector,
+  #   class = c("GVA_by_sector", class(combine_GVA_long)[-1])
+  # )
+
+  structure(
+    GVA_by_sub_sector,
+    years = sort(unique(GVA_by_sub_sector$year)),
+    class = c("GVA_by_sector", class(GVA_by_sub_sector)))
+}
diff --git a/R/combine_GVA_long.R b/R/combine_GVA_long.R
@@ -34,34 +34,46 @@ combine_GVA_long <- function(
   ABS = NULL,
   GVA = NULL,
   SIC91 = NULL,
-  DCMS_sectors = eesectors::DCMS_sectors) {
+  DCMS_sectors = eesectors::sic_mappings) {
 
   check_class(ABS)
   check_class(GVA)
   check_class(SIC91)
 
   abs_year <- max(attr(ABS, "years"))
-  #Annual business survey, duplicate 2014 data for 2015 and
-  #then duplicate non SIC91 then add SIC 91 with sales data
-  ABS_2015 <- filter(ABS, year == abs_year) %>%
-    mutate(year = abs_year + 1) %>%
 
-    #this line makes no sense to me - we are just duplicated rows we already
-    #have so surely it is redundant??
-    bind_rows(filter(ABS, !SIC %in% unique(SIC91$SIC))) %>%
+  # #Annual business survey, duplicate 2014 data for 2015 and
+  # #then duplicate non SIC91 then add SIC 91 with sales data
+  # ABS_2015 <- filter(ABS, year == abs_year) %>%
+  #   mutate(year = abs_year + 1) %>%
+  #
+  #   #this line makes no sense to me - we are just duplicated rows we already
+  #   #have so surely it is redundant??
+  #   bind_rows(filter(ABS, !SIC %in% unique(SIC91$SIC))) %>%
+  #
+  #   #simply appending SIC sales data which supplements the ABS for SIC 91
+  #   bind_rows(SIC91)
+
+  ABS_2015 <- ABS %>%
+    filter(!SIC %in% unique(SIC91$SIC)) %>%
 
     #simply appending SIC sales data which supplements the ABS for SIC 91
     bind_rows(SIC91)
 
+  ABS_2015 <-
+    bind_rows(
+      ABS_2015,
+      filter(ABS_2015, year == abs_year) %>%
+      mutate(year = abs_year + 1))
 
   # keep cases from ABS which have integer SIC - which is just a higher level SIC
-  denom <- filter(ABS_2015, SIC %in% unique(eesectors::DCMS_sectors$SIC2)) %>%
+  denom <- filter(ABS_2015, SIC %in% unique(eesectors::sic_mappings$SIC2)) %>%
     select(year, ABS, SIC) %>%
     rename(ABS_2digit_GVA = ABS, SIC2 = SIC)
 
 
   #add ABS to DCMS sectors
-  GVA_sectors <- left_join(eesectors::DCMS_sectors, ABS_2015, by = c('SIC')) %>%
+  GVA_sectors <- left_join(eesectors::sic_mappings, ABS_2015, by = c('SIC')) %>%
     rename(ABS_ind_GVA = ABS) %>%
     #drop cases where SIC is not in that sector - should do when building DCMS_sectors
     filter(present == TRUE) %>%

diff --git a/R/extract_ABS_data.R b/R/extract_ABS_data.R
@@ -102,6 +102,9 @@ extract_ABS_data <- function(
   } else
       stop("Invalid format argument")
 
+  #remove duplicate SIC 92
+  df <- df[-149, ]
+
   #determine most recent year of data
   years <- suppressWarnings(as.numeric(colnames(df)))
   years <- min(years[!is.na(years)]):max(years[!is.na(years)])

diff --git a/R/extract_GVA_data.R b/R/extract_GVA_data.R
@@ -61,6 +61,7 @@ extract_GVA_data <- function(
       col_types = rep("numeric",dend[2]))
   )
 
+  # subset data to the table we want
   data2 <- data.frame(t(data[dstart[1]:dend[1], (dstart[2] + 1):dend[2]]))
   colnames(data2) <- unlist(data[dstart[1]:dend[1], dstart[2]])
 
@@ -71,9 +72,10 @@ extract_GVA_data <- function(
 
   SIC <- as.character(t(SIC[cnames, (dstart[2] + 1):dend[2]]))
   gva <- cbind(SIC, data2, stringsAsFactors = FALSE)
+  gva[1, 1] <- "year_total"
 
   gva <- gva %>%
-    filter(SIC %in% eesectors::DCMS_sectors$SIC2)
+    filter(SIC %in% eesectors::DCMS_sectors$SIC2 | SIC == "year_total")
 
 
   #determine most recent year of data
@@ -83,8 +85,8 @@ extract_GVA_data <- function(
   #check for missing columns
   na_col_test(gva)
 
-  #check number of SIC codes in dataset
-  if (nrow(gva) != length(unique((eesectors::DCMS_sectors$SIC2))))
+  #check number of SIC codes in dataset (+1 for year_total)
+  if (nrow(gva) != length(unique((eesectors::DCMS_sectors$SIC2))) + 1)
     stop(
       paste0(
         "GVA data has rows for ",
@@ -99,15 +101,6 @@ extract_GVA_data <- function(
     mutate(year = as.integer(year)) %>%
     as.tbl()
 
-  # add total of SICs for each year
-  totals <- gva2 %>%
-    group_by(year) %>%
-    summarise(GVA = sum(GVA)) %>%
-    mutate(SIC = "year_total") %>%
-    select(SIC, year, GVA)
-
-  gva2 <- rbind(gva2, totals)
-
   #check columns names
   if(
     !identical(

diff --git a/R/extract_charities_data.R b/R/extract_charities_data.R
@@ -0,0 +1,94 @@
+#' @title extract toursim Data ONS working file spreadsheet
+#'
+#' @description The data which underlies the Economic Sectors for DCMS sectors
+#'   data is typically provided to DCMS as a spreadsheet from the Office for
+#'   National Statistics. This function extracts the tourism data from that
+#'   spreadsheet, and saves it to .Rds format. These data are provided as the
+#'   usual tourism values in the GVA dataset cannot be used.
+#'
+#'   IT IS HIGHLY ADVISEABLE TO ENSURE THAT THE DATA WHICH ARE CREATED BY THIS
+#'   FUNCTION ARE NOT STORED IN A FOLDER WHICH IS A GITHUB REPOSITORY TO
+#'   MITIGATE AGAINST ACCIDENTAL COMMITTING OF OFFICIAL DATA TO GITHUB. TOOLS TO
+#'   FURTHER HELP MITIGATE THIS RISK ARE AVAILABLE AT
+#'   https://github.com/ukgovdatascience/dotfiles.
+#'
+#' @details The best way to understand what happens when you run this function
+#'   is to look at the source code, which is available at
+#'   \url{https://github.com/ukgovdatascience/eesectors/blob/master/R/}. The
+#'   code is relatively transparent and well documented. A brief explanation of
+#'   what the function does here:
+#'
+#'   1. The function calls \code{readxl::read_excel} to load the appropriate
+#'   page from the underlying spreadsheet.
+#'
+#'   2. Sanitise the \code{colnames} using a user-supplied vector in
+#'   \code{new_colnames}. If there are no changes to the 2016 spreadhseet, in
+#'   future years, then the default vector should work in future years. If there
+#'   have been changes, this is likely to be a cause of errors.
+#'
+#'   3. Empty rows (containing all \code{NA}s) are removed.
+#'
+#'   4. The data are saved out to an R serialisation object
+#'   \code{OFFICIAL_tourism.Rds} in the specified folder.
+#'
+#' @param x Location of the input spreadsheet file. Named something like
+#'   "working_file_dcms_VXX.xlsm".
+#' @param sheet_name The name of the spreadsheet in which the data are stored.
+#'   Defaults to \code{New ABS Data}.
+#' @param col_names character vector used to rename the column names from the
+#'   imported spreadsheet. Defaults to
+#'   \code{c('year','gva','total','perc','overlap')}.
+#'
+#' @return The function returns nothing, but saves the extracted dataset to
+#'   \code{file.path(output_path, 'OFFICIAL_ABS.Rds')}. This is an R data
+#'   object, which retains the column types which would be lost if converted to
+#'   a flat format like CSV.
+#'
+#' @examples
+#'
+#' \dontrun{
+#' library(eesectors)
+#' extract_toursim_data(
+#' x = 'OFFICIAL_working_file_dcms_V13.xlsm',
+#' sheet_name = 'Tourism'
+#' )
+#' }
+#'
+#' @export
+
+extract_charities_data <- function(
+  x,
+  sheet_name = 'Charities',
+  col_names = c('year','GVA','total','perc','overlap')
+) {
+
+  # Load the data using readr. Note that additional arguments (e.g. skip, and
+  # colnames) can be passed to read_excel using the ... operator
+
+  df <- readxl::read_excel(path = x, sheet = sheet_name, col_names = TRUE, skip = 1)
+
+  # Standardise the column names.
+
+  colnames(df) <- col_names
+
+  # Remove the extraneous rows, byt first checking whether they are all NA.
+  df2 <- df[1:7, 1:5]
+
+  # Return the data to the console.
+
+  message(
+    '################################# WARNING #################################
+    The data produced by this function may contain OFFICIAL information.
+    Ensure that the data are not committed to a github repository.
+    Tools to prevent the accidental committing of data are available at:
+    https://github.com/ukgovdatascience/dotfiles. Pay special attention
+    to .Rdata files, and .Rhistory files produced by Rstudio. Best practice
+    is to disable the creation of such files.'
+  )
+
+  structure(
+    df2,
+    class = c("charities", class(df2))
+  )
+
+}
diff --git a/R/extract_tourism_data.R b/R/extract_tourism_data.R
@@ -79,6 +79,8 @@ extract_tourism_data <- function(
 
   x <- x[mask,]
 
+  x <- x[, 1:5]
+
   # Return the data to the console.
 
   message(