diff --git a/COVID19_deaths b/COVID19_deaths new file mode 100644 index 0000000..f2fe5ce --- /dev/null +++ b/COVID19_deaths @@ -0,0 +1,509 @@ +--- +title: "PIC data" +output: html_document +date: "2023-04-07" +--- + +Make sure these packages are installed +```{r setup, include=FALSE} +library(lubridate) +library(tidyverse) +library(httr) +# httr::set_config(config(ssl_verifypeer = 0L)) +library(jsonlite) +library(keyring) +library(janitor) +#instead of lapply, purr can be much faster (from training video) +library(purrr) +# for string manipulation - i.e. replace a character with something else or detect a charter etc +library(stringr) +library(rvest) +library(magrittr) +#Load devtools +library(devtools) +library(janitor) +library(sqldf) +library(readr) +library(tools) +library(tibble) +library(dplyr) +library(stats) +library(sf) +library(utils) +library(cdcfluview) +``` + +Run "aaa.r" file first +```{r setup, include=FALSE} +utils::globalVariables(c(".", "mmwrid", "season", "seasonid", "week_start", "wk_start", "wk_end", "year_wk_num")) + +# CDC U.S. region names to ID map +.region_map <- c(national=3, hhs=1, census=2, state=5) + +# CDC hospital surveillance surveillance area name to internal pkg use map +.surv_map <- c(`FluSurv-NET`="flusurv", `EIP`="eip", `IHSP`="ihsp") +.surv_rev_map <- c(flusurv="FluSurv-NET", eip="EIP", ihsp="IHSP") + +# CDC P&I mortality GepID mapping +.geoid_map <- c(national="1", state="2", region="3") + +# Our bot's user-agent string +.cdcfluview_ua <- "Mozilla/5.0 (compatible; R-cdcvluview Bot/2.0; https://github.com/hrbrmstr/cdcfluview)" + +# CDC Basemaps +.national_outline <- "https://gis.cdc.gov/grasp/fluview/FluView2References/Data/US_84.json" +.hhs_subregions_basemap <- "https://gis.cdc.gov/grasp/fluview/FluView2References/Data/HHSRegions_w_SubGroups.json" +.census_divisions_basemap <- "https://gis.cdc.gov/grasp/fluview/FluView2References/Data/CensusDivs_w_SubGroups.json" +.states_basemap <- "https://gis.cdc.gov/grasp/fluview/FluView2References/Data/StatesFluView.json" +.spread_basemap <- "https://gis.cdc.gov/grasp/fluview/FluView8References/Data/States_Territories_labels.json" +.surv_basemap <- "https://gis.cdc.gov/grasp/fluview/FluView1References/data/US_States_w_PR_labels.json" + +# CDC Age Groups +.age_grp <- c("0-4 yr", "5-24 yr", "25-64 yr", "65+ yr") + +# CDC Virus Groups +.vir_grp <- c("A (Subtyping not Performed)", "A (H1N1)pdm09", "A (Unable to Subtype)", + "B (Lineage Unspecified)", "A (H1)", "A (H3)", "B (Victoria Lineage)", + "B (Yamagata Lineage)", "H3N2v") + +# Global HTTR timeout +.httr_timeout <- 120 + +.get_meta <- function() { + + + list( + appversion = jsonlite::unbox("Public"), + key = jsonlite::unbox(""), + injson = I(list()) + ) -> body + + httr::POST( + httr::user_agent(.cdcfluview_ua), + url = "https://gis.cdc.gov/GRASP/Flu3/PostPhase03DataTool", + body = jsonlite::toJSON(body), + encode = "raw", + httr::accept_json(), + httr::add_headers( + `content-type` = "application/json;charset=UTF-8", + origin = "https://gis.cdc.gov", + referer = "ttps://gis.cdc.gov/GRASP/Fluview/FluHospRates.html" + ), + httr::timeout(.httr_timeout) + ) -> res + + httr::stop_for_status(res) + + jsonlite::fromJSON(httr::content(res, as = "text")) + +} + +``` + +Run "mmwr-map.r" file second +```{r setup, include=FALSE} +# THIS IS NOT EXPORTED FROM MMWRweek but I need it +# Find start date for a calendar year +# +# Finds the state date given a numeric calendar year +# @author Jarad Niemi \email{niemi@@iastate.edu} +.start_date = function(year) { + # Finds start state for this calendar year + # Fix by @bastistician + jan1 <- as.Date(paste0(year, '-01-01')) + wday <- as.numeric(strftime(jan1, "%w")) # Sunday is 0 + jan1 - wday + 7*(wday>3) +} + +# I discovered why 1962!: https://www.cdc.gov/mmwr/preview/mmwrhtml/su6004a9.htm +.tmp <- lapply(1962:2050, .start_date) + +mapply(function(.x, .y) { + tibble::tibble( + wk_start = seq(.tmp[[.x]], .tmp[[.y]], "1 week"), + wk_end = wk_start + 6, + year_wk_num = 1:length(wk_start) + ) -> tmp + tmp[-nrow(tmp),] +}, 1:(length(.tmp)-1), 2:length(.tmp), SIMPLIFY=FALSE) -> mmwrid_map + +mmwrid_map <- Reduce(rbind.data.frame, mmwrid_map) +mmwrid_map$mmwrid <- 1:nrow(mmwrid_map) + +#' @title MMWR ID to Calendar Mappings +#' @md +#' @description The CDC uses a unique "Morbidity and Mortality Weekly Report" identifier +#' for each week that starts at 1 (Ref: < https://www.cdc.gov/mmwr/preview/mmwrhtml/su6004a9.htm>). +#' This data frame consists of 4 columns: +#' - `wk_start`: Start date (Sunday) for the week (`Date`) +#' - `wk_end`: End date (Saturday) for the week (`Date`) +#' - `year_wk_num`: The week of the calendar year +#' - `mmwrid`: The unique MMWR identifier +#' These can be "left-joined" to data provided from the CDC to perform MMWR identifier +#' to date mappings. +#' @docType data +#' @name mmwrid_map +#' @format A data frame with 4,592 rows and 4 columns +#' @export +NULL + +#' Convert a Date to an MMWR day+week+year +#' +#' This is a reformat and re-export of a function in the `MMWRweek` package. +#' It provides a snake case version of its counterpart, produces a `tibble` +#' +#' @md +#' @param x a vector of `Date` objects or a character vector in `YYYY-mm-dd` format. +#' @return data frame (tibble) +#' @export +#' @examples +#' mwk <- mmwr_week(Sys.Date()) +mmwr_week <- function(x) { + x <- as.Date(x) + x <- setNames(MMWRweek::MMWRweek(x), c("mmwr_year", "mmwr_week", "mmwr_day")) + class(x) <- c("tbl_df", "tbl", "data.frame") + x +} + +#' Convert a Date to an MMWR weekday +#' +#' This is a reformat and re-export of a function in the `MMWRweek` package. +#' It provides a snake case version of its counterpart, produces a `factor` of +#' weekday names (Sunday-Saturday). +#' +#' @md +#' @note Weekday names are explicitly mapped to "Sunday-Saturday" or "Sun-Sat" and +#' do not change with your locale. +#' @param x a vector of `Date` objects or a character vector in `YYYY-mm-dd` format. +#' @param abbr (logical) if `TRUE`, return abbreviated weekday names, otherwise full +#' weekday names (see Note). +#' @return ordered factor +#' @export +#' @examples +#' mwday <- mmwr_weekday(Sys.Date()) +mmwr_weekday <- function(x, abbr = FALSE) { + x <- as.Date(x) + x <- MMWRweek::MMWRweekday(x) + if (abbr) { + x <- ordered( + x, + levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"), + labels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat") + ) + } + x +} + +#' Convert an MMWR year+week or year+week+day to a Date object +#' +#' This is a reformat and re-export of a function in the `MMWRweek` package. +#' It provides a snake case version of its counterpart and produces a vector +#' of `Date` objects that corresponds to the input MMWR year+week or year+week+day +#' vectors. This also adds some parameter checking and cleanup to avoid exceptions. +#' +#' @md +#' @param year,week,day Year, week and month vectors. All must be the same length +#' unless `day` is `NULL`. +#' @return vector of `Date` objects +#' @export +#' @examples +#' mwd <- mmwr_week_to_date(2016,10,3) +mmwr_week_to_date <- function(year, week, day=NULL) { + + year <- as.numeric(year) + week <- as.numeric(week) + day <- if (!is.null(day)) as.numeric(day) else rep(1, length(week)) + + week <- ifelse(0 < week & week < 54, week, NA) + + as.Date(ifelse(is.na(week), NA, MMWRweek::MMWRweek2Date(year, week, day)), + origin="1970-01-01") + +} +``` + +Run "agd-ipt.r" file Third +```{r setup, include=FALSE} +#' Age Group Distribution of Influenza Positive Tests Reported by Public Health Laboratories +#' +#' Retrieves the age group distribution of influenza positive tests that are reported by +#' public health laboratories by influenza virus type and subtype/lineage. Laboratory data +#' from multiple seasons and different age groups is provided. +#' +#' @md +#' @references +#' - [CDC FluView Portal](https://gis.cdc.gov/grasp/fluview/fluportaldashboard.html) +#' - [AGD IPT Portal](https://gis.cdc.gov/grasp/fluview/flu_by_age_virus.html) +#' @param years a vector of years to retrieve data for (i.e. `2014` for CDC +#' flu season 2014-2015). CDC has data for this API going back to 1997. +#' Default value (`NULL`) means retrieve **all** years. NOTE: if you +#' happen to specify a 2-digit season value (i.e. `57` == 2017-2018) +#' the function is smart enough to retrieve by season ID vs convert that +#' to a year. +#' @export +#' @examples +#' agd <- age_group_distribution(years=2015) +age_group_distribution <- function(years = NULL) { + + httr::GET( + url = "https://gis.cdc.gov/grasp/fluView6/GetFlu6AllDataP", + httr::user_agent(.cdcfluview_ua), + httr::add_headers( + Accept = "application/json, text/plain, */*", + Referer = "https://gis.cdc.gov/grasp/fluview/flu_by_age_virus.html" + ), + # httr::verbose(), + httr::timeout(.httr_timeout) + ) -> res + + httr::stop_for_status(res) + + xdat <- httr::content(res, as="parsed") + xdat <- jsonlite::fromJSON(xdat, flatten=TRUE) + + sea_names <- c("seasonid", "sea_description", "sea_startweek", "sea_endweek", "sea_enabled", + "sea_label", "sea_showlabtype", "incl_wkly_rates_and_strata") + age_names <- c("ageid", "age_label", "age_color_hexvalue", "age_enabled") + typ_names <- c("virusid", "vir_description", "vir_label", "vir_startmmwrid", "vir_endmmwrid", + "vir_displayorder", "vir_colorname", "vir_color_hexvalue", "vir_labtypeid", + "vir_sortid") + vir_names <- c("virusid", "ageid", "count", "mmwrid", "seasonid") + + sea_df <- stats::setNames(xdat$Season, sea_names) + age_df <- stats::setNames(xdat$Age, age_names) + typ_df <- stats::setNames(xdat$VirusType, typ_names) + vir_df <- stats::setNames(xdat$VirusData, vir_names) + + vir_df <- dplyr::left_join(vir_df, sea_df, "seasonid") + vir_df <- dplyr::left_join(vir_df, age_df, "ageid") + vir_df <- dplyr::left_join(vir_df, typ_df, "virusid") + + class(vir_df) <- c("tbl_df", "tbl", "data.frame") + + vir_df_cols <- c("sea_label", "age_label", "vir_label", "count", "mmwrid", "seasonid", + "sea_description", "sea_startweek", + "sea_endweek", "vir_description", "vir_startmmwrid", "vir_endmmwrid") + + vir_df <- vir_df[,vir_df_cols] + + vir_df$age_label <- factor(vir_df$age_label, levels=.age_grp) + vir_df$vir_label <- factor(vir_df$vir_label, levels=.vir_grp) + + vir_df <- dplyr::left_join(vir_df, mmwrid_map, "mmwrid") + + available_seasons <- sort(unique(vir_df$seasonid)) + + if (!is.null(years)) { # specified years or seasons or a mix + + years <- as.numeric(years) + years <- ifelse(years > 1996, years - 1960, years) + years <- sort(unique(years)) + years <- years[years %in% available_seasons] + + if (length(years) == 0) { + years <- rev(available_seasons)[1] + curr_season_descr <- vir_df[vir_df$seasonid == years,]$sea_description[1] + message(sprintf("No valid years specified, defaulting to this flu season => ID: %s [%s]", + years, curr_season_descr)) + } + + vir_df <- dplyr::filter(vir_df, seasonid %in% years) + + } + + vir_df + +} +``` + +Run "utils.r" file Fourth +```{r setup, include=FALSE} +.mcga <- function(tbl) { + + x <- colnames(tbl) + x <- tolower(x) + x <- gsub("[[:punct:][:space:]]+", "_", x) + x <- gsub("_+", "_", x) + x <- gsub("(^_|_$)", "", x) + x <- gsub("^x_", "", x) + x <- make.unique(x, sep = "_") + + colnames(tbl) <- x + + tbl + +} + +to_num <- function(x) { + x <- gsub("%", "", x, fixed=TRUE) + x <- gsub(">", "", x, fixed=TRUE) + x <- gsub("<", "", x, fixed=TRUE) + x <- gsub(",", "", x, fixed=TRUE) + x <- gsub(" ", "", x, fixed=TRUE) + suppressWarnings(as.numeric(x)) +} +``` + +Run "pi-mortality.r" file Fifth +1. I Changed "pi_mortality" to "pic_mortality" in line 351 +2. I added the following variables to xdf in line 440 "number_covid19", "percent_pic","total_pic" +```{r setup, include=FALSE} +pic_mortality <- function(coverage_area=c("national", "state", "region"), years=NULL) { + + coverage_area <- match.arg(tolower(coverage_area), choices = c("national", "state", "region")) + + us_states <- read.csv("https://gis.cdc.gov/grasp/fluview/Flu7References/Data/USStates.csv", + stringsAsFactors=FALSE) + us_states <- setNames(us_states, c("region_name", "subgeoid", "state_abbr")) + us_states <- us_states[,c("region_name", "subgeoid")] + us_states$subgeoid <- as.character(us_states$subgeoid) + + meta <- jsonlite::fromJSON("https://gis.cdc.gov/grasp/flu7/GetPhase07InitApp?appVersion=Public") + + mapcode_df <- setNames(meta$nchs_mapcode[,c("mapcode", "description")], c("map_code", "callout")) + mapcode_df$map_code <- as.character(mapcode_df$map_code) + + geo_df <- meta$nchs_geo_dim + geo_df$geoid <- as.character(geo_df$geoid) + + age_df <- setNames(meta$nchs_ages, c("ageid", "age_label")) + age_df$ageid <- as.character(age_df$ageid) + + sum_df <- meta$nchs_summary + sum_df$seasonid <- as.character(sum_df$seasonid) + sum_df$ageid <- as.character(sum_df$ageid) + sum_df$geoid <- as.character(sum_df$geoid) + + available_seasons <- sort(meta$seasons$seasonid) + + if (is.null(years)) { # ALL YEARS + years <- available_seasons + } else { # specified years or seasons or a mix + + years <- as.numeric(years) + years <- ifelse(years > 1996, years - 1960, years) + years <- sort(unique(years)) + years <- years[years %in% available_seasons] + + if (length(years) == 0) { + years <- rev(sort(meta$seasons$seasonid))[1] + curr_season_descr <- meta$seasons[meta$seasons$seasonid == years, "description"] + message(sprintf("No valid years specified, defaulting to this flu season => ID: %s [%s]", + years, curr_season_descr)) + } + + } + + httr::POST( + url = "https://gis.cdc.gov/grasp/flu7/PostPhase07DownloadData", + httr::user_agent(.cdcfluview_ua), + httr::add_headers( + Origin = "https://gis.cdc.gov", + Accept = "application/json, text/plain, */*", + Referer = "https://gis.cdc.gov/grasp/fluview/mortality.html" + ), + encode = "json", + body = list( + AppVersion = "Public", + AreaParameters = list(list(ID=.geoid_map[coverage_area])), + SeasonsParameters = lapply(years, function(.x) { list(ID=as.integer(.x)) }), + AgegroupsParameters = list(list(ID="1")) + ), + # httr::verbose(), + httr::timeout(.httr_timeout) + ) -> res + + httr::stop_for_status(res) + + res <- httr::content(res, as="parsed", flatten=TRUE) + + suppressWarnings(suppressMessages(dplyr::bind_rows(res$seasons) %>% + dplyr::left_join(mapcode_df, "map_code") %>% + dplyr::left_join(geo_df, "geoid") %>% + dplyr::left_join(age_df, "ageid") %>% + dplyr::left_join(dplyr::mutate(mmwrid_map, mmwrid=as.character(mmwrid)), "mmwrid") -> xdf)) + + xdf <- dplyr::mutate(xdf, coverage_area = coverage_area) + + if (coverage_area == "state") { + xdf <- dplyr::left_join(xdf, us_states, "subgeoid") + } else if (coverage_area == "region") { + xdf$region_name <- sprintf("Region %s", xdf$subgeoid) + } else { + xdf$region_name <- "national" + } + + xdf[,c("seasonid", "baseline", "threshold", "percent_pni", + "percent_complete", "number_influenza", "number_pneumonia", + "all_deaths", "Total_PnI", "weeknumber", "geo_description", + "age_label", "wk_start", "wk_end", "year_wk_num", "mmwrid", + "coverage_area", "region_name", "callout", "number_covid19", "percent_pic","total_pic")] -> xdf + + xdf$baseline <- to_num(xdf$baseline) / 100 + xdf$threshold <- to_num(xdf$threshold) / 100 + xdf$percent_pni <- to_num(xdf$percent_pni) / 100 + xdf$percent_complete <- to_num(xdf$percent_complete) / 100 + xdf$number_influenza <- to_num(xdf$number_influenza) + xdf$number_pneumonia <- to_num(xdf$number_pneumonia) + xdf$number_covid19 <- to_num(xdf$number_covid19) + xdf$all_deaths <- to_num(xdf$all_deaths) + xdf$total_pic <- to_num(xdf$total_pic) + xdf$Total_PnI <- to_num(xdf$Total_PnI) + xdf$percent_pic <- to_num(xdf$percent_pic) + xdf <- xdf %>% + dplyr::rename( + week_start = wk_start, + week_end = wk_end, + year_week_num = year_wk_num + ) + xdf <- .mcga(xdf) + + xdf + +} +``` + +After running the updated package code above, I created the below report which uses the new PIC_mortality package and SQL code to organize the variables from the package into a customized table +```{r echo = T, results = 'hide'} +#for next season (2023-2024) change years to "years = 2017:2023" +nat_pic <- pic_mortality(coverage_area = c("national"), years = 2017:2022) +region_pic <- pic_mortality(coverage_area = c("Region"), years = 2017:2022) +st_pic <- pic_mortality(coverage_area = c("state"), years = 2017:2022) + +#National Mortality file. +#ADD NEW MMWRID WHEN NEW SEASON STARTS - The range for each season ID should be 51 unless its a year with 53 weeks then it will be 52 +#I think this is how it should look for future seasons -> '2023-2024' = 3223 to 3274; '2024-2025' = 3275 to 3326; '2025-2026' = 3327 to 3379 +nat_pic_mort <- sqldf( + " + SELECT + geo_description as 'AREA', + case + when geo_description = 'National' then ' ' + else geo_description end as 'SUB AREA', + age_label as 'AGE GROUP', + case + when mmwrid between '2910' and '2961' then '2017-2018' + when mmwrid between '2962' and '3013' then '2018-2019' + when mmwrid between '3014' and '3065' then '2019-2020' + when mmwrid between '3066' and '3118' then '2020-2021' + when mmwrid between '3119' and '3170' then '2021-2022' + when mmwrid between '3171' and '3222' then '2022-2023' + else mmwrid end as 'SEASON', + year_week_num as 'WEEK', + threshold*100 as 'THRESHOLD', + baseline*100 as 'BASELINE', + percent_pic as 'PERCENT PIC', + number_influenza as 'NUM INFLUENZA DEATHS', + number_pneumonia as 'NUM PNEUMONIA DEATHS', + number_covid19 as 'NUM COVID-19 DEATHS', + total_pic as 'TOTAL PIC', + all_deaths as 'TOTAL DEATHS', + percent_complete*100 as 'PERCENT COMPLETE' + from + nat_pic + Order by Season asc + " + ) + +``` +