From c0041952b8e173cec0f84ef1278dc98be993eb02 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 30 Apr 2026 13:32:00 +0200 Subject: [PATCH] rebase from merge and push our changes --- .../snt_dhis2_formatting_population.ipynb | 46 +- .../code/snt_dhis2_formatting_pyramid.ipynb | 195 +- ...snt_dhis2_formatting_reporting_rates.ipynb | 42 +- .../code/snt_dhis2_formatting_routine.ipynb | 33 +- .../code/snt_dhis2_formatting_shapes.ipynb | 40 +- .../snt_dhis2_formatting_report.ipynb | 1855 +++++------------ .../utils/snt_dhis2_formatting.r | 445 +++- .../utils/snt_dhis2_formatting_report.r | 505 ++++- 8 files changed, 1536 insertions(+), 1625 deletions(-) diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_population.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_population.ipynb index 89de2b5..96104ee 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_population.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_population.ipynb @@ -12,11 +12,15 @@ "cell_type": "code", "execution_count": null, "id": "52fae14b-d3bc-4a74-9c36-89e854e70636", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "source(file.path(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\"))\n", - "setup_var <- get_setup_variables(packages= c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"))\n", + "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n", + "setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"))\n", "\n", "# Load config\n", "config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n", @@ -49,7 +53,11 @@ "cell_type": "code", "execution_count": null, "id": "6febcacc-afff-4cbb-ac0d-bc2bb00edff4", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "dhis2_pyramid_data <- load_dataset_file(extracts_dataset_id, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\"), verbose=FALSE)\n", @@ -77,7 +85,11 @@ "cell_type": "code", "execution_count": null, "id": "6b1cda01-c507-4fb4-85d2-1a568df219dc", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "dhis2_data <- load_dataset_file(extracts_dataset_id, paste0(COUNTRY_CODE, \"_dhis2_raw_population.parquet\"), verbose=FALSE)\n", @@ -100,7 +112,11 @@ "cell_type": "code", "execution_count": null, "id": "6e3f488b-95f1-4aff-b024-cf4e771fed63", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "population_table <- build_population_indicators(dhis2_data, dhis2_pyramid_data, config_json)\n", @@ -123,7 +139,11 @@ "cell_type": "code", "execution_count": null, "id": "34713812-ca2f-452b-8a15-6d471409f5ff", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "admin_cols <- get_admin_config(config_json)\n", @@ -146,7 +166,11 @@ "cell_type": "code", "execution_count": null, "id": "a32f4a3b-e9f1-4941-b2a2-80b4fa4eb2de", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Create pop template (util for pop transformation pipeline)\n", @@ -191,7 +215,11 @@ "cell_type": "code", "execution_count": null, "id": "59891b6e-490e-4a22-8909-416520d168da", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# save template\n", diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_pyramid.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_pyramid.ipynb index f8d5543..bfb5b66 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_pyramid.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_pyramid.ipynb @@ -12,11 +12,15 @@ "cell_type": "code", "execution_count": null, "id": "fca7918b-f8bf-4e39-9601-febf5ca7877a", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n", - "setup_var <- get_setup_variables(packages= c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"stringi\", \"stringr\", \"jsonlite\", \"httr\", \"glue\"))\n", + "setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"stringi\", \"stringr\", \"jsonlite\", \"httr\", \"glue\"))\n", "\n", "# Load config\n", "config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n", @@ -67,7 +71,11 @@ "cell_type": "code", "execution_count": null, "id": "8faec80e-2b3e-4162-8767-d6c5c28eadcf", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "log_msg(glue(\"Start DHIS2 organisation units(pyramid) formatting.\")) \n", @@ -89,7 +97,11 @@ "cell_type": "code", "execution_count": null, "id": "076dacc5-f449-4ab1-9168-9dbc766d295f", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Extract lon/lat from geometry\n", @@ -102,57 +114,58 @@ "id": "c25113ed-090a-418c-9a5c-2b30e0f2f773", "metadata": {}, "source": [ - "### Try coordinates validation steps" + "### Coordinate validation" ] }, { "cell_type": "code", "execution_count": null, "id": "43d9f9dc-97fc-4083-bc17-e78d0037e02d", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "# Step 1 - Try Load country border from folder (if exists)\n", - "shapes_sf <- read_geojson_safe(file.path(\"~/workspace/data/dhis2/extracts_formatted/\" , paste0(COUNTRY_CODE, \"_shapes.geojson\")))" + "shapes_sf <- read_geojson_safe(file.path(\"~/workspace/data/dhis2/extracts_formatted/\", paste0(COUNTRY_CODE, \"_shapes.geojson\")))" ] }, { "cell_type": "code", "execution_count": null, "id": "f8d11c71-0921-4a1e-b005-737b18392383", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "# Step 2 - Keep original coordinates already inside the country (If shapes are available)\n", + "log_msg(\"Running coordinate boundary validation\")\n", "\n", - "if (!is.null(shapes_sf)) {\n", - " log_msg(\"Running coordinate boundary validation\")\n", - " \n", - " shapes_sf_boundary <- prepare_country_boundary(shapes_sf)\n", - " lon0 <- pyramid_data_coords$LONGITUDE\n", - " lat0 <- pyramid_data_coords$LATITUDE\n", - " within_original <- points_within_country_batch(lon0, lat0, shapes_sf_boundary)\n", - " has_coords <- !is.na(lon0) & !is.na(lat0)\n", - " \n", - " coord_fix_df <- tibble(\n", - " LONGITUDE_ORIGINAL = lon0,\n", - " LATITUDE_ORIGINAL = lat0,\n", - " LONGITUDE_FIXED = NA_real_,\n", - " LATITUDE_FIXED = NA_real_,\n", - " COORD_FIX_METHOD = NA_character_,\n", - " COORD_IS_VALID = FALSE\n", - " )\n", - " \n", - " ok_original <- has_coords & within_original \n", - " \n", - " if (any(ok_original)) {\n", - " coord_fix_df$LONGITUDE_FIXED[ok_original] <- lon0[ok_original]\n", - " coord_fix_df$LATITUDE_FIXED[ok_original] <- lat0[ok_original]\n", - " coord_fix_df$COORD_FIX_METHOD[ok_original] <- \"ORIGINAL\"\n", - " coord_fix_df$COORD_IS_VALID[ok_original] <- TRUE\n", - " }\n", - "} else {\n", - " log_msg(\"Skipped coordinate boundary validation: No reference shapes available.\")\n", + "shapes_sf_boundary <- prepare_country_boundary(shapes_sf)\n", + "lon0 <- pyramid_data_coords$LONGITUDE\n", + "lat0 <- pyramid_data_coords$LATITUDE\n", + "within_original <- points_within_country_batch(lon0, lat0, shapes_sf_boundary)\n", + "has_coords <- !is.na(lon0) & !is.na(lat0)\n", + "\n", + "coord_fix_df <- tibble(\n", + " LONGITUDE_ORIGINAL = lon0,\n", + " LATITUDE_ORIGINAL = lat0,\n", + " LONGITUDE_FIXED = NA_real_,\n", + " LATITUDE_FIXED = NA_real_,\n", + " COORD_FIX_METHOD = NA_character_,\n", + " COORD_IS_VALID = FALSE\n", + ")\n", + "\n", + "ok_original <- has_coords & within_original\n", + "\n", + "if (any(ok_original)) {\n", + " coord_fix_df$LONGITUDE_FIXED[ok_original] <- lon0[ok_original]\n", + " coord_fix_df$LATITUDE_FIXED[ok_original] <- lat0[ok_original]\n", + " coord_fix_df$COORD_FIX_METHOD[ok_original] <- \"ORIGINAL\"\n", + " coord_fix_df$COORD_IS_VALID[ok_original] <- TRUE\n", "}" ] }, @@ -160,29 +173,31 @@ "cell_type": "code", "execution_count": null, "id": "1cea34fe-cc7c-4116-b018-5b8c895bb980", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "# Step 3 - For remaining points, try correction sequence\n", - "if (!is.null(shapes_sf)) {\n", - " \n", - " miss <- !has_coords\n", - " if (any(miss)) {\n", - " coord_fix_df$COORD_FIX_METHOD[miss] <- \"MISSING_COORDINATES\"\n", - " }\n", + "miss <- !has_coords\n", + "if (any(miss)) {\n", + " coord_fix_df$COORD_FIX_METHOD[miss] <- \"MISSING_COORDINATES\"\n", + "}\n", + "\n", + "need_fix <- !within_original & has_coords\n", + "log_msg(glue(\"Found {sum(need_fix)} / {length(need_fix)} coordinates that require fixing.\"))\n", + "\n", + "fix_results <- list()\n", + "if (any(need_fix)) {\n", + " idx_fix <- which(need_fix)\n", + " fix_results <- lapply(idx_fix, function(i) {\n", + " fix_coordinate_pair_in_country(lon0[i], lat0[i], shapes_sf_boundary, max_shift = 2)\n", + " })\n", "\n", - " need_fix <- !within_original & has_coords \n", - " log_msg(glue(\"Found {sum(need_fix)} / {length(need_fix)} coordinates that require fixing.\"))\n", - " \n", - " if (any(need_fix)) {\n", - " idx_fix <- which(need_fix)\n", - " fix_results <- lapply(idx_fix, function(i) {\n", - " fix_coordinate_pair_in_country(lon0[i], lat0[i], shapes_sf_boundary, max_shift = 2)\n", - " })\n", - " \n", - " fixed_coords <- sum(sapply(fix_results, function(x) x$VALID == TRUE), na.rm = TRUE) \n", + " fixed_coords <- sum(sapply(fix_results, function(x) x$VALID == TRUE), na.rm = TRUE)\n", " log_msg(glue(\"Points corrected: {fixed_coords} out of {sum(need_fix)}\"))\n", - " \n", + "\n", " for (k in seq_along(idx_fix)) {\n", " i <- idx_fix[k]\n", " fr <- fix_results[[k]]\n", @@ -190,7 +205,6 @@ " coord_fix_df$LATITUDE_FIXED[i] <- fr$LATITUDE\n", " coord_fix_df$COORD_FIX_METHOD[i] <- fr$METHOD\n", " coord_fix_df$COORD_IS_VALID[i] <- fr$VALID\n", - " }\n", " }\n", "}" ] @@ -199,48 +213,49 @@ "cell_type": "code", "execution_count": null, "id": "eff4aff7-fe8d-4d47-b87f-b3fc8512d10e", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "# Display fixed points (if any)\n", - "if (exists(\"fixed_coords\") && length(fixed_coords) > 0) {\n", - " my_map <- plot_fixed_coordinates(fix_results, shapes_sf_boundary) \n", - "} " + "if (length(fix_results) > 0) {\n", + " my_map <- plot_fixed_coordinates(fix_results, shapes_sf_boundary)\n", + "}" ] }, { "cell_type": "code", "execution_count": null, "id": "49ef06a2-072d-4296-a220-6d5ab1948db0", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "# Step 6 - Apply final coordinates and flag invalids\n", - "if (!is.null(shapes_sf)) {\n", - " pyramid_data_coords$LONGITUDE <- coord_fix_df$LONGITUDE_FIXED\n", - " pyramid_data_coords$LATITUDE <- coord_fix_df$LATITUDE_FIXED\n", - " \n", - " invalid_coords <- pyramid_data_coords %>%\n", - " bind_cols(coord_fix_df %>% select(LONGITUDE_ORIGINAL, LATITUDE_ORIGINAL, COORD_FIX_METHOD, COORD_IS_VALID)) %>%\n", - " filter(!COORD_IS_VALID & !is.na(LONGITUDE_ORIGINAL) & !is.na(LATITUDE_ORIGINAL)) %>%\n", - " mutate(INVALID_COORD_REASON = \"NO_VALID_TRANSFORMATION_IN_COUNTRY\") %>% \n", - " select(-LONGITUDE, -LATITUDE)\n", - " \n", - " # Step 7 - Summary logs\n", - " n_total_coords <- sum(!is.na(coord_fix_df$LONGITUDE_ORIGINAL) & !is.na(coord_fix_df$LATITUDE_ORIGINAL))\n", - " n_kept_original <- sum(coord_fix_df$COORD_FIX_METHOD == \"ORIGINAL\", na.rm = TRUE)\n", - " n_corrected <- sum(coord_fix_df$COORD_IS_VALID & coord_fix_df$COORD_FIX_METHOD != \"ORIGINAL\", na.rm = TRUE)\n", - " n_invalid <- nrow(invalid_coords)\n", - " \n", - " log_msg(glue(\"Coordinate quality check over {n_total_coords} FOSAs: original valid={n_kept_original}, corrected={n_corrected}, invalid={n_invalid}.\"))\n", - " if (n_corrected > 0) {\n", - " log_msg(glue(\"Applied coordinate correction algorithm to {n_corrected} FOSAs (swap/sign/decimal left-to-right, k<=2).\"), \"warning\")\n", - " }\n", - " if (n_invalid > 0) {\n", - " log_msg(glue(\"{n_invalid} FOSAs remain invalid after correction attempts. LONGITUDE/LATITUDE set to NA.\"), \"warning\")\n", - " }\n", - "} else {\n", - " invalid_coords <- c()\n", + "pyramid_data_coords$LONGITUDE <- coord_fix_df$LONGITUDE_FIXED\n", + "pyramid_data_coords$LATITUDE <- coord_fix_df$LATITUDE_FIXED\n", + "\n", + "invalid_coords <- pyramid_data_coords %>%\n", + " bind_cols(coord_fix_df %>% select(LONGITUDE_ORIGINAL, LATITUDE_ORIGINAL, COORD_FIX_METHOD, COORD_IS_VALID)) %>%\n", + " filter(!COORD_IS_VALID & !is.na(LONGITUDE_ORIGINAL) & !is.na(LATITUDE_ORIGINAL)) %>%\n", + " mutate(INVALID_COORD_REASON = \"NO_VALID_TRANSFORMATION_IN_COUNTRY\") %>%\n", + " select(-LONGITUDE, -LATITUDE)\n", + "\n", + "n_total_coords <- sum(!is.na(coord_fix_df$LONGITUDE_ORIGINAL) & !is.na(coord_fix_df$LATITUDE_ORIGINAL))\n", + "n_kept_original <- sum(coord_fix_df$COORD_FIX_METHOD == \"ORIGINAL\", na.rm = TRUE)\n", + "n_corrected <- sum(coord_fix_df$COORD_IS_VALID & coord_fix_df$COORD_FIX_METHOD != \"ORIGINAL\", na.rm = TRUE)\n", + "n_invalid <- nrow(invalid_coords)\n", + "\n", + "log_msg(glue(\"Coordinate quality check over {n_total_coords} FOSAs: original valid={n_kept_original}, corrected={n_corrected}, invalid={n_invalid}.\"))\n", + "if (n_corrected > 0) {\n", + " log_msg(glue(\"Applied coordinate correction algorithm to {n_corrected} FOSAs (swap/sign/decimal left-to-right, k<=2).\"), \"warning\")\n", + "}\n", + "if (n_invalid > 0) {\n", + " log_msg(glue(\"{n_invalid} FOSAs remain invalid after correction attempts. LONGITUDE/LATITUDE set to NA.\"), \"warning\")\n", "}\n", "\n", "head(pyramid_data_coords, 3)" diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_reporting_rates.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_reporting_rates.ipynb index 7ea3560..c30117b 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_reporting_rates.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_reporting_rates.ipynb @@ -12,11 +12,15 @@ "cell_type": "code", "execution_count": null, "id": "a619ce26-fc1d-4272-b97b-dd695dbe2d58", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "source(file.path(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\"))\n", - "setup_var <- get_setup_variables(packages=c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\",\"glue\"))\n", + "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n", + "setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"))\n", "\n", "# Load config\n", "config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n", @@ -43,7 +47,11 @@ "cell_type": "code", "execution_count": null, "id": "9bef62cf-4cfd-45af-899d-7ba50a028d72", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Load file from dataset\n", @@ -72,7 +80,11 @@ "cell_type": "code", "execution_count": null, "id": "a519b90a-dcc2-4bb7-a3c3-3429f40189b7", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "log_msg(glue(\"Start DHIS2 reporting rates formatting.\")) \n", @@ -101,7 +113,6 @@ }, "outputs": [], "source": [ - "# Set administrative columns\n", "adm_1_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_1)\n", "adm_1_name_col <- ADMIN_1\n", "adm_2_id_col <- gsub(\"_NAME\", \"_ID\", ADMIN_2)\n", @@ -109,19 +120,28 @@ "adm_ou_id_col <- glue(\"LEVEL_{ou_level}_ID\")\n", "adm_ou_name_col <- glue(\"LEVEL_{ou_level}_NAME\")\n", "\n", - "# Administrative columns list\n", - "admin_columns <- c(\n", + "available_cols <- names(dhis2_data_clean)\n", + "dup_cols <- names(dhis2_data_clean)[duplicated(names(dhis2_data_clean))]\n", + "if (length(dup_cols) > 0) {\n", + " stop(glue(\"[ERROR] Duplicate column names in reporting data: {paste(unique(dup_cols), collapse=', ')}\"))\n", + "}\n", + "\n", + "admin_columns <- unique(c(\n", " adm_1_id_col,\n", " adm_1_name_col,\n", " adm_2_id_col,\n", " adm_2_name_col,\n", " adm_ou_id_col,\n", " adm_ou_name_col\n", - ") \n", + "))\n", + "\n", + "missing_admin <- setdiff(admin_columns, available_cols)\n", + "if (length(missing_admin) > 0) {\n", + " stop(glue(\"[ERROR] Missing required admin columns in reporting rates data: {paste(missing_admin, collapse=', ')}\"))\n", + "}\n", "\n", - "# Select relevant columns for SNT\n", "fixed_cols <- c(\"PE\", \"VALUE\", \"PRODUCT_UID\", \"PRODUCT_NAME\", \"PRODUCT_METRIC\")\n", - "selected_cols <- c(fixed_cols, admin_columns)\n", + "selected_cols <- unique(c(fixed_cols, admin_columns))\n", "dhis2_data_selection <- dhis2_data_clean[selected_cols]\n", "\n", "print(dim(dhis2_data_selection))\n", diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb index e0fbf0f..0522f73 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_routine.ipynb @@ -11,19 +11,16 @@ { "cell_type": "code", "execution_count": null, - "id": "687392e7-fe6c-4355-9f4d-6718b467a33d", + "id": "46002c70", "metadata": { - "tags": [ - "parameters" - ], "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "source(file.path(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\"))\n", - "setup_var <- get_setup_variables(packages=c(\"lubridate\", \"zoo\", \"glue\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"reticulate\"))\n", + "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n", + "setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"lubridate\", \"zoo\", \"glue\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"reticulate\"))\n", "config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n", "\n", "# Save this country code in a variable\n", @@ -103,7 +100,11 @@ "cell_type": "code", "execution_count": null, "id": "eea23e94-1118-43c7-9ee4-73aa8ac2adb2", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Max admin columns available (matchin ou)\n", @@ -167,7 +168,11 @@ "cell_type": "code", "execution_count": null, "id": "7ff54ba9-888b-4301-aa0a-a813e5259b95", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "dhis_indicator_definitions <- config_json$DHIS2_DATA$DHIS2_INDICATOR_DEFINITIONS\n", @@ -180,7 +185,11 @@ "cell_type": "code", "execution_count": null, "id": "89f49b04-245e-4b24-bb38-25800a05bd2d", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Select the list of valid indicators from the definitions: \n", @@ -204,7 +213,11 @@ "cell_type": "code", "execution_count": null, "id": "16ec64ae-9537-470c-84bb-0e190e799117", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# build SNT indicators \n", diff --git a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_shapes.ipynb b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_shapes.ipynb index a9d0378..0465241 100644 --- a/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_shapes.ipynb +++ b/pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_shapes.ipynb @@ -12,11 +12,15 @@ "cell_type": "code", "execution_count": null, "id": "b3d71304-8153-4980-b805-dfabea3a9737", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "source(file.path(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\"))\n", - "setup_var <- get_setup_variables(packages=c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"sf\", \"rmapshaper\", \"glue\"))\n", + "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n", + "setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"sf\", \"rmapshaper\", \"glue\"))\n", "\n", "# Load config\n", "config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n", @@ -48,7 +52,11 @@ "cell_type": "code", "execution_count": null, "id": "26d681e9-20b6-4efe-a8bc-c78463f43a2d", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "dhis2_data <- load_dataset_file(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_shapes.parquet\"))\n", @@ -75,7 +83,11 @@ "cell_type": "code", "execution_count": null, "id": "0b0e688e-2a1d-451c-8578-e765c5fef3fb", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Set administrative column names\n", @@ -120,7 +132,11 @@ "cell_type": "code", "execution_count": null, "id": "921f4bec-bc61-46b3-b897-9efce70f0260", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Data selection\n", @@ -164,7 +180,11 @@ "cell_type": "code", "execution_count": null, "id": "1ce19ce4-49ff-4e9d-b91f-b73864076b53", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Safe Convert geometry column from GeoJSON to 'sf' (simple feature geometry)\n", @@ -184,7 +204,11 @@ "cell_type": "code", "execution_count": null, "id": "6ddfe3e7-bdd1-4078-ac5b-dda01b8ea77b", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# Validation check\n", diff --git a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb index 1b7d1c6..d73a6d6 100644 --- a/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb +++ b/pipelines/snt_dhis2_formatting/reporting/snt_dhis2_formatting_report.ipynb @@ -1,1386 +1,477 @@ { - "cells": [ - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# 💡 Comments / Questions & To Do's:\n", - "# - filter by YEAR keep only 2022-2024): \n", - "# 1. Why these years? Arbitrary choice? Based on what? linked to what?\n", - "# 2. Is this a paramater is some other pipeline? if so, should be integrated here somehow \n", - "# - Missing data: why do we have NA values for population? Are these real NA (missing data) or 0?\n", - "# - OUTLIERS: there are clear outliers (i.e., DS AGADEZ): shall we do some simple data cleaning here?\n", - "# - Population catagories (breaks) do we have a specific scale in mind \n", - "# (i.e., use same as another country) or can I set it based on the data" - ], - "execution_count": null, - "outputs": [], - "id": "47551f88-b40b-449f-9dc1-59db71183611" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# TO DO / FINISH:\n", - "# - add safety \"if\" logic so nb does not fail if data is missing or wrong path ...\n", - "# - (maybe) also add meaningful messages\n", - "# - Add code to export PNG files of relevant figures\n", - "# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)\n", - "# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)" - ], - "execution_count": null, - "outputs": [], - "id": "342b6b54-4812-4b07-b408-68a034b4014e" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Paths and Config" - ], - "id": "5b72f828-4fc1-462d-babc-f8f6c9c96ff5" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_formatting\")\n", - "\n", - "REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_formatting/reporting\")\n", - "\n", - "# Create output directories if they don't exist (before loading utils)\n", - "figures_dir <- file.path(REPORTING_NB_PATH, \"outputs\", \"figures\")\n", - "if (!dir.exists(figures_dir)) {\n", - " dir.create(figures_dir, recursive = TRUE)\n", - " print(paste0(\"Created figures directory: \", figures_dir))\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "7d3285c7-1a60-46ad-9541-36a703d51924" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_formatting_report.r\"))" - ], - "execution_count": null, - "outputs": [], - "id": "732733e7-8890-4c3e-be64-496fd4a2c800" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "required_packages <- c(\n", - " \"tidyverse\", \n", - " \"arrow\", \n", - " \"sf\", \n", - " \"reticulate\",\n", - " \"patchwork\"\n", - ") \n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ], - "execution_count": null, - "outputs": [], - "id": "3f26728d-10a0-42d6-a7ff-368cc38e60b9" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load openhexa.sdk\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ], - "execution_count": null, - "outputs": [], - "id": "20475dd9-5091-4f87-9ae2-d0235921fe94" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })" - ], - "execution_count": null, - "outputs": [], - "id": "9f70d726-1c34-47dc-b963-bb23e42994bb" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Configuration variables\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ], - "execution_count": null, - "outputs": [], - "id": "90d58c60-fb4e-40e4-add8-5f258f541843" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "4b96fa16-25cc-4420-9ad8-332af4a59fdf" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# printdim() loaded from pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "8eece9e0-2544-48c1-8579-a5a721af4ff8" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Import data" - ], - "id": "643abe28-da3b-4bd2-9ecc-126b18b85c69" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# import analytics DHIS2 data\n", - "routine_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(\"[WARNING] Error while loading DHIS2 Routine data for: \" , COUNTRY_CODE, \n", - " \" the report cannot be executed. [ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - " })\n", - "\n", - "printdim(routine_data)" - ], - "execution_count": null, - "outputs": [], - "id": "43bbbcdf-c1d1-4631-980c-2c4465cf7a55" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "population_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_population.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste0(COUNTRY_NAME , \" Population data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " population_data <- NULL\n", - " })\n", - "\n", - "printdim(population_data)" - ], - "execution_count": null, - "outputs": [], - "id": "d53274c5-965e-4a11-bb77-c9b899d5cb9c" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_NAME , \" Shapes data is not available in dataset : \" , dataset_name, \" last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes_data <- NULL\n", - " })\n", - "\n", - "printdim(shapes_data)" - ], - "execution_count": null, - "outputs": [], - "id": "c1be5372-cbc1-4343-ab11-01eae0fa9d60" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [], - "id": "c881f748-e391-46c9-a36a-ed11c238a6ce" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "65ea60f5-99e9-46d1-89f0-03245d9efd0b" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **Complétude des indicateurs composites**\n" - ], - "id": "e3d5b582-a38f-4ce0-a9a2-9a53ab5eb233" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators" - ], - "id": "ca84fce1-0407-433a-a98a-e65ed15ab8de" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "head(routine_data)" - ], - "execution_count": null, - "outputs": [], - "id": "c7691e61-6542-4d40-af2a-c018d29b86a8" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur" - ], - "id": "c109e82d-8c72-41f0-857a-322163cf213e" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 0: Rename your data for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to DATE\n", - "data <- data %>%\n", - " mutate(\n", - " DATE = ymd(paste0(PERIOD, \"01\"))\n", - " )\n", - "\n", - "# Step 2: Reshape wide to long: INDICATOR = column name (e.g., CONF), VALUE = value\n", - "indicator_vars <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", - "))\n", - "\n", - "long_data <- data %>%\n", - " pivot_longer(cols = any_of(indicator_vars),\n", - " names_to = \"INDICATOR\",\n", - " values_to = \"VALUE\") %>%\n", - " rename(OU = OU_ID)\n", - "\n", - "# Step 3: Build expected full grid (OU × INDICATOR × DATE)\n", - "full_grid <- expand_grid(\n", - " OU = unique(long_data$OU),\n", - " INDICATOR = unique(long_data$INDICATOR),\n", - " DATE = unique(long_data$DATE)\n", - ")\n", - "\n", - "# Step 4: Join and assess reporting status\n", - "reporting_check <- full_grid %>%\n", - " left_join(\n", - " long_data %>% select(OU, INDICATOR, DATE, VALUE),\n", - " by = c(\"OU\", \"INDICATOR\", \"DATE\")\n", - " ) %>%\n", - " mutate(\n", - " is_missing = is.na(VALUE),\n", - " is_zero = VALUE == 0 & !is.na(VALUE),\n", - " is_positive = VALUE > 0 & !is.na(VALUE)\n", - " )\n", - "\n", - "# Step 5: Summarise reporting status\n", - "reporting_summary <- reporting_check %>%\n", - " group_by(INDICATOR, DATE) %>%\n", - " summarise(\n", - " n_total = n_distinct(OU),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Prepare plot-ready data\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(\n", - " cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\",\n", - " values_to = \"Percentage\"\n", - " ) %>%\n", - " mutate(\n", - " Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")\n", - " ) %>%\n", - " complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "0f54505a-2dcc-429e-a900-46d4fae6fd31" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "options(repr.plot.width = 17, repr.plot.height = 10)\n", - "ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +\n", - " geom_col(position = \"stack\") +\n", - " facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", - " scale_y_continuous() +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", - " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", - " x = \"Mois\", y = \"% des formations sanitaires\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " strip.text = element_text(size = 16),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16)\n", - " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "cfd115e7-176d-4beb-9ab9-2e6990cb16af" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur." - ], - "id": "e6871759-714b-437a-8b9c-5a5a06656567" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 0: Rename for convenience\n", - "data <- routine_data\n", - "\n", - "# Step 1: Convert PERIOD to proper Date\n", - "data <- data %>%\n", - " mutate(Date = ymd(paste0(PERIOD, \"01\")))\n", - "\n", - "# Step 2: Identify indicator columns\n", - "indicator_cols <- setdiff(names(data), c(\n", - " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\",\n", - " \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", - "))\n", - "\n", - "# Step 3: Reshape to long format\n", - "data_long <- data %>%\n", - " select(ADM2_ID, OU_ID, Date, any_of(indicator_cols)) %>%\n", - " pivot_longer(cols = any_of(indicator_cols),\n", - " names_to = \"Indicator\", values_to = \"value\") %>%\n", - " mutate(value = as.numeric(value))\n", - "\n", - "# Step 4: Full expected grid at ADM2 level\n", - "full_grid <- expand_grid(\n", - " ADM2_ID = unique(data_long$ADM2_ID),\n", - " Indicator = unique(data_long$Indicator),\n", - " Date = unique(data_long$Date)\n", - ")\n", - "\n", - "# Step 5: Detect if *any* health facility reported per district × indicator × date\n", - "reporting_check <- data_long %>%\n", - " group_by(ADM2_ID, Indicator, Date) %>%\n", - " summarise(\n", - " is_missing = all(is.na(value)),\n", - " is_zero = all(value == 0, na.rm = TRUE),\n", - " is_positive = any(value > 0, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 6: Join with full grid to fill in missing ADM2s\n", - "reporting_full <- full_grid %>%\n", - " left_join(reporting_check, by = c(\"ADM2_ID\", \"Indicator\", \"Date\")) %>%\n", - " mutate(\n", - " is_missing = replace_na(is_missing, TRUE),\n", - " is_zero = replace_na(is_zero, FALSE),\n", - " is_positive = replace_na(is_positive, FALSE)\n", - " )\n", - "\n", - "# Step 7: Summarise by Indicator and Date\n", - "reporting_summary <- reporting_full %>%\n", - " group_by(Indicator, Date) %>%\n", - " summarise(\n", - " n_total = n_distinct(ADM2_ID),\n", - " n_missing = sum(is_missing),\n", - " n_zero = sum(is_zero & !is_missing),\n", - " n_positive = sum(is_positive),\n", - " pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),\n", - " pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),\n", - " pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 8: Reshape for plotting\n", - "plot_data <- reporting_summary %>%\n", - " pivot_longer(cols = starts_with(\"pct_\"),\n", - " names_to = \"Status\", values_to = \"Percentage\") %>%\n", - " mutate(Status = recode(Status,\n", - " pct_missing = \"Valeur manquante\",\n", - " pct_zero = \"Valeur nulle rapportée\",\n", - " pct_positive = \"Valeur positive rapportée\")) %>%\n", - " complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", - "\n", - "# Step 9: Plot\n", - "ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +\n", - " geom_col(position = \"stack\") +\n", - " facet_wrap(~ Indicator, scales = \"free_y\") +\n", - " scale_y_continuous(limits = c(0, 100)) +\n", - " scale_fill_manual(values = c(\n", - " \"Valeur manquante\" = \"tomato\",\n", - " \"Valeur nulle rapportée\" = \"skyblue\",\n", - " \"Valeur positive rapportée\" = \"green\"\n", - " )) +\n", - " labs(\n", - " title = \"Taux de rapportage par indicateur (niveau district)\",\n", - " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", - " x = \"Mois\", y = \"% des districts\",\n", - " fill = \"Statut du rapportage\"\n", - " ) +\n", - " theme_minimal(base_size = 14) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 18),\n", - " strip.text = element_text(size = 14),\n", - " axis.title = element_text(size = 14),\n", - " axis.text = element_text(size = 12)\n", - " )\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "c89f6c77-dd42-4616-8eb5-1642d5b51157" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Cohérence interne des indicateurs composites" - ], - "id": "5cda3985" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence\n", - "\n", - "Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs." - ], - "id": "c131a633" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# detect_mad_outliers() loaded from utils/snt_dhis2_formatting_report.r" - ], - "execution_count": null, - "outputs": [], - "id": "936268f4" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 0: Select relevant core indicators\n", - "target_indicators <- c(\"SUSP\", \"TEST\", \"CONF\", \"MALTREAT\", \"PRES\")\n", - "\n", - "# Step 1: Convert wide to long format\n", - "routine_long <- routine_data %>%\n", - " pivot_longer(\n", - " cols = any_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(\n", - " PERIOD = as.character(PERIOD), # Ensure PERIOD is character for join\n", - " OU = OU_ID # Alias for join clarity\n", - " )\n", - "\n", - "# Step 2: Filter to indicators of interest\n", - "routine_long_filtered <- routine_long %>%\n", - " filter(indicator %in% target_indicators)\n", - "\n", - "# Step 3: Calculate MAD15\n", - "mad15_data <- detect_mad_outliers(\n", - " routine_long_filtered,\n", - " deviation = 15,\n", - " outlier_column = \"mad15\"\n", - ")\n", - "\n", - "# Step 4: Calculate MAD10 (only where mad15 not flagged or missing)\n", - "mad10_flags <- mad15_data %>%\n", - " filter(is.na(mad15) | mad15 == FALSE, !is.na(value)) %>%\n", - " detect_mad_outliers(deviation = 10, outlier_column = \"mad10\")\n", - "\n", - "# Step 5: Combine MAD15 and MAD10 results\n", - "mad_combined <- mad15_data %>%\n", - " left_join(\n", - " mad10_flags %>% select(PERIOD, OU, indicator, mad10),\n", - " by = c(\"PERIOD\", \"OU\", \"indicator\")\n", - " )" - ], - "execution_count": null, - "outputs": [], - "id": "881f9625" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE)\n", - "outlier_flags <- mad_combined %>%\n", - " filter(mad15 == TRUE | mad10 == TRUE) %>%\n", - " mutate(PERIOD = as.numeric(PERIOD)) %>%\n", - " select(PERIOD, OU, indicator)\n", - "\n", - "# Step 7: Reshape routine_data to long format for filtering\n", - "routine_long_all <- routine_data %>%\n", - " pivot_longer(\n", - " cols = any_of(target_indicators),\n", - " names_to = \"indicator\",\n", - " values_to = \"value\"\n", - " ) %>%\n", - " mutate(OU = OU_ID)\n", - "\n", - "# Step 8: Remove outliers\n", - "routine_long_clean <- routine_long_all %>%\n", - " anti_join(outlier_flags, by = c(\"PERIOD\", \"OU\", \"indicator\"))\n", - "\n", - "# Step 9: Reshape back to wide format if needed\n", - "routine_data_clean <- routine_long_clean %>%\n", - " select(-OU) %>%\n", - " pivot_wider(names_from = indicator, values_from = value)\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "04d41ed1" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2.2 Cohérence des indicateurs" - ], - "id": "c6a5a77b" - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "routine_data_clean <- routine_data_clean %>%\n", - " mutate(across(where(is.list), ~ map_dbl(., ~ sum(as.numeric(.x), na.rm = TRUE))))\n", - "\n", - "# Now head() should work\n", - "head(routine_data_clean, 3)" - ], - "execution_count": null, - "outputs": [], - "id": "aa5f6333-1ed7-4fad-b418-5a9f61b5af22" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 1: Extract year and month from PERIOD\n", - "routine_hd_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6)\n", - " ) %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Create scatter plots\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "\n", - "p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +\n", - " geom_point(alpha = 0.5, color = \"blue\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +\n", - " geom_point(alpha = 0.5, color = \"darkgreen\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +\n", - " geom_point(alpha = 0.5, color = \"purple\") +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", - " labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") +\n", - " theme_minimal(base_size = 16)\n", - "\n", - "# Step 3: Combine plots\n", - "(p1 | p2 | p3) + plot_layout(guides = \"collect\")\n" - ], - "execution_count": null, - "outputs": [], - "id": "6cfeb18e" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Step 1: Aggregate monthly values\n", - "rds_clean_month <- routine_data_clean %>%\n", - " mutate(\n", - " YEAR = substr(PERIOD, 1, 4),\n", - " MONTH = substr(PERIOD, 5, 6),\n", - " DATE = as.Date(paste(YEAR, MONTH, \"01\", sep = \"-\"))\n", - " ) %>%\n", - " group_by(YEAR, MONTH, DATE) %>%\n", - " summarise(\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "# Step 2: Plot monthly national trends\n", - "options(repr.plot.width = 14, repr.plot.height = 6)\n", - "rds_clean_month %>%\n", - " pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = \"Indicator\") %>%\n", - " ggplot(aes(x = DATE, y = value, color = Indicator)) +\n", - " geom_line(linewidth = 1.2) +\n", - " labs(\n", - " title = \"Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)\",\n", - " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " axis.title = element_text(size = 16),\n", - " axis.text = element_text(size = 16),\n", - " legend.title = element_text(size = 16),\n", - " legend.text = element_text(size = 16)\n", - " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "0df24272" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Carte des populations par district sanitaire (DS)" - ], - "id": "780fc9f8-6c67-4328-85f1-6bdefcd15b48" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3.1. Carte de la Population pour ADM2 " - ], - "id": "da58bbd3" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Code from previous version of the notebook\n", - "# Uses continuos scale for population\n", - "\n", - "# Run if population_data is available\n", - "if (!is.null(population_data) & !is.null(shapes_data)) {\n", - " # Join population to spatial shapes\n", - " map_data <- shapes_data %>%\n", - " left_join(population_data, by = \"ADM2_ID\")\n", - " \n", - " # Plot population per district (DS)\n", - " plot <- ggplot(map_data) +\n", - " geom_sf(aes(fill = POPULATION), color = \"white\", size = 0.2) +\n", - " scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Données DHIS2\",\n", - " caption = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " theme_minimal(base_size = 14) \n", - "\n", - " print(plot)\n", - "\n", - "} else {\n", - " print(\"Population or shapes data not available.\")\n", - "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "6965155d" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)\n", - "Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. " - ], - "id": "eb276692" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "population_data_filtered <- population_data\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " print(\"🇳🇪 Executing NER specific code ... \")\n", - "\n", - " IRdisplay::display_markdown(\"\n", - " ### 🇳🇪 NER specific code \n", - " Made ad hoc to allow comparison with data from other or previous analyses. Namely:\n", - " * only year 2022 to 2024\n", - " * specific palette (yellowish to brick red)\n", - " * specific intervals\n", - " * looks at **disaggregated** population <- this is sometimes contry-specific!\n", - "\")\n", - "\n", - " # --- Filter data to keep only 2022-2024 ... ---\n", - " years_to_keep <- 2022:2024\n", - " population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)\n", - "\n", - " # --- Read data from SNT_metadata.json ---\n", - " metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - " # --- Assign population breaks from metadata ---\n", - " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", - " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", - " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", - "\n", - " # --- Create dynamic labels based on breaks ---\n", - " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", - " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", - " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", - "\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "4d33724e" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "NER_palette_population <- c(\n", - " \"1\" = \"#fae6db\",\n", - " \"2\" = \"#f1b195\",\n", - " \"3\" = \"#ea7354\",\n", - " \"4\" = \"#cc3f32\",\n", - " \"5\" = \"#972620\"\n", - ")\n" - ], - "execution_count": null, - "outputs": [], - "id": "0fdb96a0-873d-4f85-9c34-23c89c204c30" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Population Totales" - ], - "id": "95892df7-e5b8-4d7a-bf96-88673e633370" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - " # IMPORTNAT: palette vector MUST be RENAMED with the (dynamic) descriptive labels\n", - "names(NER_palette_population) <- labels_tot\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION,\n", - " breaks = c(0, value_breaks_tot, Inf),\n", - " labels = labels_tot, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population totale par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_tot, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_totals.png\")),\n", - " width = 14,\n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "a0a196b8-2db5-478d-899a-48985d1735f0" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Population Femmes Enceintes (FE)" - ], - "id": "aca477aa-4d93-4a74-ad8c-32a30f85a552" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_fe\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_FE,\n", - " breaks = c(0, value_breaks_fe, Inf),\n", - " labels = labels_fe, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des femmes enceintes par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_fe, \n", - " drop = FALSE # Prevents dropping empty levels from legend\n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export to see better in high resolution\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_fe.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}\n" - ], - "execution_count": null, - "outputs": [], - "id": "9324a56b" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Population Enfants moins de 5 ans (U5)" - ], - "id": "bd5fe86d-591a-4f5a-bc42-58180a413d5d" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (COUNTRY_CODE == \"NER\") {\n", - "\n", - "names(NER_palette_population) <- labels_u5\n", - "\n", - "plot <- population_data_filtered %>%\n", - " mutate(\n", - " CATEGORY_POPULATION = cut(\n", - " POPULATION_U5,\n", - " breaks = c(0, value_breaks_u5, Inf),\n", - " labels = labels_u5, \n", - " right = TRUE,\n", - " include.lowest = TRUE\n", - " )\n", - " ) %>% \n", - " left_join(shapes_data, \n", - " by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>% \n", - " ggplot() +\n", - " geom_sf(aes(geometry = geometry,\n", - " fill = CATEGORY_POPULATION),\n", - " color = \"black\",\n", - " linewidth = 0.25, \n", - " show.legend = TRUE\n", - " ) +\n", - " labs(\n", - " title = \"Population des enfants de moins de 5 ans par district sanitaire (DS)\",\n", - " subtitle = \"Source: NMDR / DHIS2\"\n", - " ) +\n", - " scale_fill_manual(\n", - " values = NER_palette_population, \n", - " limits = labels_u5, \n", - " drop = FALSE \n", - " ) +\n", - " facet_wrap(~YEAR, ncol = 3) +\n", - " theme_void() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " strip.text = element_text(face = \"bold\"),\n", - " legend.key.height = unit(0.5, \"line\"),\n", - " legend.margin = margin(10, 0, 0, 0)\n", - " )\n", - "\n", - "print(plot)\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", paste0(COUNTRY_CODE, \"_choropleth_population_u5.png\")),\n", - " width = 14, \n", - " height = 8,\n", - " dpi = 300\n", - ")\n", - "\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "4046761f" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3.2. Complétude et qualité des données de la Population" - ], - "id": "61e5ac12-c973-48e0-8c97-1af90e4b59a5" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Population Totale" - ], - "id": "0d86ed4a-e194-496b-9440-ad206157ee17" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# hist(population_data$POPULATION)\n", - "hist(population_data_filtered$POPULATION)" - ], - "execution_count": null, - "outputs": [], - "id": "bec2759d-9ac4-42e1-9f7e-7076780bd7d6" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION,\n", - " y = fct_reorder(ADM2_NAME, POPULATION),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "# Export PNG\n", - "ggsave(\n", - " filename = file.path(REPORTING_NB_PATH, \"outputs\", \"figures\", \"hist_population_totale.png\"),\n", - " units = \"cm\",\n", - " width = 15,\n", - " height = 23,\n", - " bg = \"white\"\n", - ")" - ], - "execution_count": null, - "outputs": [], - "id": "bc00527f-d8f9-4c9e-bf4a-326c92cf8a68" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Population Femmes Enceintes (FE)" - ], - "id": "d6ab387a-cc9e-42b9-a634-12af21bef0f5" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Wrap in if statement to avoid errors if POPULATION_FE is missing\n", - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " hist(population_data_filtered$POPULATION_FE)\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "c6bb79dd-2d8a-4cd1-bf91-3e0e48c14eda" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (\"POPULATION_FE\" %in% names(population_data_filtered)) { \n", - " \n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_FE,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_FE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+05, 1e+06, 1.5e+06),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "} " - ], - "execution_count": null, - "outputs": [], - "id": "4200afa2-e2f0-4876-9842-141b96f32fe8" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Population Enfants moins de 5 ans (U5)" - ], - "id": "e39305c0-3700-48c3-967a-b9c6af3e737f" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - " hist(population_data_filtered$POPULATION_U5)\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "bbda9b88-9b91-4845-83a8-795a12124999" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "if (\"POPULATION_U5\" %in% names(population_data_filtered)) {\n", - "\n", - "ggplot(population_data_filtered) +\n", - " geom_point(aes(x = POPULATION_U5,\n", - " y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),\n", - " color = factor(YEAR))\n", - " ) +\n", - " facet_grid(rows = \"ADM1_NAME\", \n", - " scale = \"free_y\", \n", - " space = \"free_y\", \n", - " switch = \"y\") +\n", - " scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),\n", - " labels = scales::comma) +\n", - " scale_color_viridis_d(option = \"mako\", end = 0.8) +\n", - " labs(\n", - " # title = \"\"\n", - " color = \"Année\") +\n", - " theme_minimal() +\n", - " theme(\n", - " axis.text = element_text(size = 7),\n", - " axis.title.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " strip.placement = \"outside\",\n", - " panel.grid.minor.x = element_blank(),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "742116ab-fef7-46ea-8c4b-0aa2a166005d" + "cells": [ + { + "cell_type": "markdown", + "id": "caa953c2-33b0-43da-857f-e8ef15cf728e", + "metadata": {}, + "source": [ + "# DHIS2 formatting — rapport\n", + "\n", + "Complétude HF / district, cohérence (ADM2×mois), cartes population. Setup Papermill, OpenHEXA et chargements : `utils/snt_dhis2_formatting_report.r` (fonctions `formatting_report_*`) ; logique DHIS2 partagée : `utils/snt_dhis2_formatting.r`.\n", + "\n", + "**Exécution :** Jupyter / Papermill.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a683cfc9-c0ae-49c3-b4cf-04e8da956fd6", + "metadata": { + "tags": [ + "parameters" + ], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Papermill\n", + "REPORT_MAX_YEARS <- 5L\n", + "REPORT_PLOT_MONTHS <- 36L\n", + "REPORT_MAX_INDICATORS <- 40L\n", + "REPORT_POP_SCATTER_MAX_ROWS <- 4000L\n", + "REPORT_SHAPE_SIMPLIFY_TOL <- 0.002\n", + "REPORT_FIG_DPI <- 120L\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3293ed26-9f29-4d2e-a1ad-75b7759a539c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r\")\n", + "# Papermill: REPORT_* (cellule parameters) sont lus par snt_setup_report(); le reste du notebook : chemins, pays et paramètres via `setup$` uniquement.\n", + "setup <- snt_setup_report()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d685d669-b627-421d-a8ef-563252edb3b2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "message(paste0(\"Formatting report setup ready for \", setup$COUNTRY_CODE, \".\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be0159f9-6e12-48cc-8068-5878575888de", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Taille des figures dans Jupyter (IRKernel), optionnel\n", + "options(repr.plot.width = 20, repr.plot.height = 12, repr.plot.res = 120)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f988ad6-f263-40a4-ab3c-5c829e74dfb7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "str(setup$config_json$SNT_CONFIG)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "907117b3-1505-4da8-94af-db03023a4394", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "routine_data <- formatting_report_load_routine_data(setup)\n", + "population_data <- formatting_report_load_population_data(setup)\n", + "shapes_data <- formatting_report_load_shapes_data(setup)\n", + "shapes_data <- formatting_report_simplify_shapes(shapes_data, setup$REPORT_SHAPE_SIMPLIFY_TOL)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0875fcb2-923d-4381-8733-4311bf508908", + "metadata": {}, + "source": [ + "## 1. Complétude (résumé HF / district) — fenêtre récente + indicateurs plafonnés" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a176a68e-3979-41c8-ade3-227ef4c13d8d", + "metadata": { + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "data <- routine_data %>% dplyr::mutate(DATE = lubridate::ymd(paste0(PERIOD, \"01\")))\n", + "indicator_vars <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"DATE\"\n", + "))\n", + "if (length(indicator_vars) == 0L) {\n", + " stop(\"Aucune colonne indicateur après setdiff pour la section 1.1.\")\n", + "} else {\n", + "long_data <- data %>%\n", + " tidyr::pivot_longer(dplyr::any_of(indicator_vars), names_to = \"INDICATOR\", values_to = \"VALUE\") %>%\n", + " dplyr::rename(OU = OU_ID)\n", + "long_data <- report_limit_long_indicators(long_data, \"INDICATOR\", setup$REPORT_MAX_INDICATORS)\n", + "reporting_summary <- reporting_summary_facility_chunked(long_data)\n", + "report_release_objects(\"long_data\", \"data\", \"indicator_vars\", full_gc = TRUE)\n", + "plot_data <- reporting_summary %>%\n", + " tidyr::pivot_longer(dplyr::starts_with(\"pct_\"), names_to = \"Status\", values_to = \"Percentage\") %>%\n", + " dplyr::mutate(Status = dplyr::recode(Status, pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\", pct_positive = \"Valeur positive rapportée\")) %>%\n", + " tidyr::complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))\n", + "plot_data <- report_filter_recent_months(plot_data, \"DATE\", setup$REPORT_PLOT_MONTHS)\n", + "run_report_plot(\"1.1 complétude HF\", function() {\n", + " if (skip_heavy_plot_input(plot_data, \"1.1 plot_data\")) return(invisible(NULL))\n", + " ncol_use <- 4L\n", + " n_ind <- dplyr::n_distinct(plot_data$INDICATOR)\n", + " n_row <- as.integer(ceiling(n_ind / ncol_use))\n", + " repr_h <- max(12, min(36, 6 + 1.35 * n_row))\n", + " options(repr.plot.width = 20, repr.plot.height = repr_h, repr.plot.res = 120)\n", + " p <- ggplot2::ggplot(plot_data, ggplot2::aes(x = DATE, y = Percentage, fill = Status)) +\n", + " ggplot2::geom_col(position = \"stack\") +\n", + " ggplot2::facet_wrap(~ INDICATOR, scales = \"free_y\", ncol = 4) +\n", + " ggplot2::scale_y_continuous() +\n", + " ggplot2::scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\", \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " ggplot2::labs(\n", + " title = \"Taux de rapportage par indicateur (niveau formation sanitaire)\",\n", + " subtitle = \"Proportion des valeurs rapportées par mois et par indicateur\",\n", + " x = \"Mois\", y = \"% des formations sanitaires\", fill = \"Statut du rapportage\"\n", + " ) +\n", + " ggplot2::theme_minimal(base_size = 16) +\n", + " ggplot2::theme(\n", + " plot.title = ggplot2::element_text(face = \"bold\", size = 20),\n", + " strip.text = ggplot2::element_text(size = 16),\n", + " axis.title = ggplot2::element_text(size = 16),\n", + " axis.text = ggplot2::element_text(size = 16)\n", + " )\n", + " print(p)\n", + "})\n", + "report_release_objects(\"plot_data\", \"reporting_summary\", full_gc = TRUE)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51672f8e-47c4-4622-8988-cb0e3b8d6bad", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data <- routine_data %>% dplyr::mutate(Date = lubridate::ymd(paste0(PERIOD, \"01\")))\n", + "indicator_cols <- setdiff(names(data), c(\n", + " \"PERIOD\", \"YEAR\", \"MONTH\", \"OU_ID\", \"OU_NAME\", \"ADM1_NAME\", \"ADM1_ID\", \"ADM2_NAME\", \"ADM2_ID\", \"Date\"\n", + "))\n", + "if (length(indicator_cols) == 0L) {\n", + " stop(\"Aucune colonne indicateur pour la section 1.2.\")\n", + "} else {\n", + "if (length(indicator_cols) > setup$REPORT_MAX_INDICATORS) indicator_cols <- indicator_cols[seq_len(setup$REPORT_MAX_INDICATORS)]\n", + "data_long <- data %>%\n", + " dplyr::select(ADM2_ID, OU_ID, Date, dplyr::any_of(indicator_cols)) %>%\n", + " tidyr::pivot_longer(dplyr::any_of(indicator_cols), names_to = \"Indicator\", values_to = \"value\") %>%\n", + " dplyr::mutate(value = as.numeric(value))\n", + "reporting_summary <- reporting_summary_adm2_chunked(data_long)\n", + "plot_data <- reporting_summary %>%\n", + " tidyr::pivot_longer(dplyr::starts_with(\"pct_\"), names_to = \"Status\", values_to = \"Percentage\") %>%\n", + " dplyr::mutate(Status = dplyr::recode(Status, pct_missing = \"Valeur manquante\",\n", + " pct_zero = \"Valeur nulle rapportée\", pct_positive = \"Valeur positive rapportée\")) %>%\n", + " tidyr::complete(Indicator, Date, Status, fill = list(Percentage = 0))\n", + "plot_data <- report_filter_recent_months(plot_data, \"Date\", setup$REPORT_PLOT_MONTHS)\n", + "run_report_plot(\"1.2 complétude district\", function() {\n", + " if (skip_heavy_plot_input(plot_data, \"1.2 plot_data\")) return(invisible(NULL))\n", + " n_ind <- dplyr::n_distinct(plot_data$Indicator)\n", + " ncol_guess <- max(1L, as.integer(ceiling(sqrt(n_ind))))\n", + " n_row <- as.integer(ceiling(n_ind / ncol_guess))\n", + " repr_h <- max(12, min(36, 6 + 1.28 * n_row))\n", + " options(repr.plot.width = 20, repr.plot.height = repr_h, repr.plot.res = 120)\n", + " p <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Date, y = Percentage, fill = Status)) +\n", + " ggplot2::geom_col(position = \"stack\") +\n", + " ggplot2::facet_wrap(~ Indicator, scales = \"free_y\") +\n", + " ggplot2::scale_y_continuous(limits = c(0, 100)) +\n", + " ggplot2::scale_fill_manual(values = c(\n", + " \"Valeur manquante\" = \"tomato\", \"Valeur nulle rapportée\" = \"skyblue\",\n", + " \"Valeur positive rapportée\" = \"green\"\n", + " )) +\n", + " ggplot2::labs(\n", + " title = \"Taux de rapportage par indicateur (niveau district)\",\n", + " subtitle = \"Proportion des districts (ADM2_ID) rapportant chaque mois\",\n", + " x = \"Mois\", y = \"% des districts\", fill = \"Statut du rapportage\"\n", + " ) +\n", + " ggplot2::theme_minimal(base_size = 14) +\n", + " ggplot2::theme(\n", + " plot.title = ggplot2::element_text(face = \"bold\", size = 18),\n", + " strip.text = ggplot2::element_text(size = 14),\n", + " axis.title = ggplot2::element_text(size = 14),\n", + " axis.text = ggplot2::element_text(size = 12)\n", + " )\n", + " print(p)\n", + "})\n", + "report_release_objects(\"data_long\", \"data\", \"plot_data\", \"reporting_summary\", full_gc = TRUE)\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "e9b40af7-ace3-479c-b5d8-311ab6dd6455", + "metadata": {}, + "source": [ + "## 2. Cohérence des indicateurs\n", + "\n", + "Agrégation **ADM2 × mois** sur la fenêtre d’années chargée. **Courbe nationale :** construction des mois à partir de PERIOD (année + mois → date du 1er du mois) ; les **derniers mois dont les totaux nationaux sont entièrement à 0** (extrait souvent incomplet en fin de série, `sum(..., na.rm=TRUE)` quand toutes les valeurs OU sont NA) sont **retirés** pour éviter une chute artificielle des courbes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09f687f0-067b-4cc1-abad-343cbb828ddc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "routine_hd_month <- routine_adm2_month_from_wide(routine_data)\n", + "routine_hd_month <- report_downsample_rows(routine_hd_month, 12000L)\n", + "run_report_plot(\"2.2 nuages ADM2-mois\", function() {\n", + " if (nrow(routine_hd_month) == 0L) return(invisible(NULL))\n", + " if (!all(c(\"SUSP\", \"TEST\", \"CONF\") %in% names(routine_hd_month))) {\n", + " message(\"Colonnes SUSP/TEST/CONF manquantes après agrégation — nuages omis.\")\n", + " return(invisible(NULL))\n", + " }\n", + " if (skip_heavy_plot_input(routine_hd_month, \"2.2 routine_hd_month\")) return(invisible(NULL))\n", + " options(repr.plot.width = 18, repr.plot.height = 8, repr.plot.res = 120)\n", + " p1 <- ggplot2::ggplot(routine_hd_month, ggplot2::aes(x = SUSP, y = TEST)) + ggplot2::geom_point(alpha = 0.5, color = \"blue\") +\n", + " ggplot2::geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " ggplot2::labs(title = \"Suspectés vs Testés\", x = \"Cas suspectés\", y = \"Cas testés\") + ggplot2::theme_minimal(base_size = 16)\n", + " p2 <- ggplot2::ggplot(routine_hd_month, ggplot2::aes(x = TEST, y = CONF)) + ggplot2::geom_point(alpha = 0.5, color = \"darkgreen\") +\n", + " ggplot2::geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " ggplot2::labs(title = \"Testés vs Confirmés\", x = \"Cas testés\", y = \"Cas confirmés\") + ggplot2::theme_minimal(base_size = 16)\n", + " if (all(c(\"CONF\", \"MALTREAT\") %in% names(routine_hd_month))) {\n", + " p3 <- ggplot2::ggplot(routine_hd_month, ggplot2::aes(x = CONF, y = MALTREAT)) + ggplot2::geom_point(alpha = 0.5, color = \"purple\") +\n", + " ggplot2::geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"red\") +\n", + " ggplot2::labs(title = \"Confirmés vs Traités\", x = \"Cas confirmés\", y = \"Cas traités\") + ggplot2::theme_minimal(base_size = 16)\n", + " } else {\n", + " p3 <- ggplot2::ggplot() + ggplot2::theme_void() + ggplot2::labs(title = \"MALTREAT absent\")\n", + " }\n", + " print((p1 | p2 | p3) + patchwork::plot_layout(guides = \"collect\"))\n", + "})\n", + "rds_clean_month <- routine_national_month_from_wide(routine_data)\n", + "run_report_plot(\"2.2 tendances nationales\", function() {\n", + " if (nrow(rds_clean_month) == 0L) return(invisible(NULL))\n", + " if (skip_heavy_plot_input(rds_clean_month, \"2.2 rds_clean_month\")) return(invisible(NULL))\n", + " vars_nat <- intersect(c(\"SUSP\", \"TEST\", \"CONF\", \"PRES\"), names(rds_clean_month))\n", + " if (length(vars_nat) == 0L) return(invisible(NULL))\n", + " options(repr.plot.width = 16, repr.plot.height = 8, repr.plot.res = 120)\n", + " p <- rds_clean_month %>%\n", + " tidyr::pivot_longer(dplyr::all_of(vars_nat), names_to = \"Indicator\") %>%\n", + " ggplot2::ggplot(ggplot2::aes(x = DATE, y = value, color = Indicator)) +\n", + " ggplot2::geom_line(linewidth = 1.2) +\n", + " ggplot2::labs(\n", + " title = \"Tendances mensuelles nationales des indicateurs composites\",\n", + " x = \"Mois\", y = \"Nombre de cas\", color = \"Indicateur\"\n", + " ) +\n", + " ggplot2::theme_minimal(base_size = 16) +\n", + " ggplot2::theme(\n", + " plot.title = ggplot2::element_text(face = \"bold\", size = 20),\n", + " axis.title = ggplot2::element_text(size = 16),\n", + " axis.text = ggplot2::element_text(size = 16),\n", + " legend.title = ggplot2::element_text(size = 16),\n", + " legend.text = ggplot2::element_text(size = 16)\n", + " )\n", + " print(p)\n", + "})\n", + "report_release_objects(\"routine_hd_month\", \"rds_clean_month\", \"routine_data\", full_gc = TRUE)\n" + ] + }, + { + "cell_type": "markdown", + "id": "da14aaa5-8469-4f1e-92c8-20d862bc24a6", + "metadata": {}, + "source": [ + "## 3. Cartes population ADM2 et complétude population" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5004276-166d-4f86-b1a3-bce5bf9a897c", + "metadata": { + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "map_data <- dplyr::left_join(shapes_data, population_data, by = \"ADM2_ID\")\n", + "run_report_plot(\"3.1 carte population\", function() {\n", + " if (skip_heavy_plot_input(map_data, \"3.1 map_data\")) return(invisible(NULL))\n", + " options(repr.plot.width = 20, repr.plot.height = 12, repr.plot.res = 120)\n", + " p <- ggplot2::ggplot(map_data) +\n", + " ggplot2::geom_sf(ggplot2::aes(fill = POPULATION), color = \"white\", linewidth = 0.2) +\n", + " ggplot2::scale_fill_viridis_c(option = \"C\", name = \"Population\") +\n", + " ggplot2::labs(\n", + " title = \"Population totale par district sanitaire (DS)\",\n", + " subtitle = \"Données DHIS2\",\n", + " caption = \"Source: NMDR / DHIS2\"\n", + " ) +\n", + " ggplot2::theme_minimal(base_size = 14)\n", + " print(p)\n", + "})\n", + "rm(map_data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "414c0241-3706-4500-a85d-e457b070acce", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "population_data_filtered <- population_data\n", + "if (identical(setup$COUNTRY_CODE, \"NER\")) {\n", + " years_to_keep <- 2022:2024\n", + " population_data_filtered <- dplyr::filter(population_data, YEAR %in% years_to_keep)\n", + " metadata_json <- jsonlite::fromJSON(file.path(setup$CONFIG_PATH, \"SNT_metadata.json\"))\n", + " value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)\n", + " value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)\n", + " value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)\n", + " labels_tot <- create_dynamic_labels(value_breaks_tot)\n", + " labels_u5 <- create_dynamic_labels(value_breaks_u5)\n", + " labels_fe <- create_dynamic_labels(value_breaks_fe)\n", + " NER_palette_population <- c(\"1\" = \"#fae6db\", \"2\" = \"#f1b195\", \"3\" = \"#ea7354\", \"4\" = \"#cc3f32\", \"5\" = \"#972620\")\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dfd2d93-23ba-4b0f-8e07-76605d722d5d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (identical(setup$COUNTRY_CODE, \"NER\")) {\n", + " if (length(NER_palette_population) != length(labels_tot)) {\n", + " stop(\"NER: longueur palette (\", length(NER_palette_population), \") != labels_tot (\", length(labels_tot), \").\")\n", + " }\n", + " names(NER_palette_population) <- labels_tot\n", + " run_report_plot(\"NER choropleth population totale\", function() {\n", + " options(repr.plot.width = 20, repr.plot.height = 12, repr.plot.res = 120)\n", + " plot <- population_data_filtered %>% dplyr::mutate(CATEGORY_POPULATION = cut(POPULATION,\n", + " breaks = c(0, value_breaks_tot, Inf), labels = labels_tot, right = TRUE, include.lowest = TRUE)) %>%\n", + " dplyr::left_join(shapes_data, by = dplyr::join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>%\n", + " ggplot2::ggplot() + ggplot2::geom_sf(ggplot2::aes(geometry = geometry, fill = CATEGORY_POPULATION), linewidth = 0.15) +\n", + " ggplot2::facet_wrap(~YEAR, ncol = 3) + ggplot2::theme_void() + ggplot2::scale_fill_manual(values = NER_palette_population, limits = labels_tot, drop = FALSE) +\n", + " ggplot2::labs(title = \"Population totale (NER)\", subtitle = \"Source: NMDR / DHIS2\")\n", + " print(plot)\n", + " ggplot2::ggsave(file.path(setup$figures_dir, paste0(setup$COUNTRY_CODE, \"_choropleth_population_totals.png\")), plot, width = 12, height = 7, dpi = setup$REPORT_FIG_DPI)\n", + " })\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80d1ff8b-8afa-4e86-a266-5e974adc803f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pop_scatter <- report_downsample_rows(population_data_filtered, setup$REPORT_POP_SCATTER_MAX_ROWS)\n", + "run_report_plot(\"Population totale vs district\", function() {\n", + " if (skip_heavy_plot_input(pop_scatter, \"3.2 population scatter\")) return(invisible(NULL))\n", + " n_adm1 <- dplyr::n_distinct(pop_scatter$ADM1_NAME)\n", + " repr_h <- max(26, min(42, 12 + 2.8 * n_adm1))\n", + " options(repr.plot.width = 20, repr.plot.height = repr_h, repr.plot.res = 120)\n", + " p <- ggplot2::ggplot(pop_scatter) +\n", + " ggplot2::geom_point(ggplot2::aes(x = POPULATION, y = forcats::fct_reorder(ADM2_NAME, POPULATION), color = factor(YEAR)), size = 0.5, alpha = 0.65) +\n", + " ggplot2::facet_grid(rows = ggplot2::vars(ADM1_NAME), scales = \"free_y\", space = \"free_y\", switch = \"y\") +\n", + " ggplot2::scale_x_continuous(labels = scales::comma) +\n", + " ggplot2::labs(color = \"Année\") +\n", + " ggplot2::theme_minimal() +\n", + " ggplot2::theme(\n", + " axis.text = ggplot2::element_text(size = 7),\n", + " axis.title.x = ggplot2::element_text(size = 7),\n", + " axis.title.y = ggplot2::element_blank(),\n", + " strip.placement = \"outside\",\n", + " panel.grid.minor.x = ggplot2::element_blank(),\n", + " legend.position = \"bottom\"\n", + " )\n", + " print(p)\n", + " ggplot2::ggsave(file.path(setup$figures_dir, \"hist_population_totale.png\"), p, width = 15, height = 23, units = \"cm\", dpi = setup$REPORT_FIG_DPI, bg = \"white\")\n", + "})\n" + ] + }, + { + "cell_type": "markdown", + "id": "dbbed77e-d7d9-4ce8-b72a-4bb7f374d5af", + "metadata": {}, + "source": [ + "_Fin du rapport._" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r index a547e46..5074d91 100644 --- a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r @@ -4,20 +4,18 @@ source(file.path("~/workspace/code", "snt_utils.r")) -#' Get Setup Variables for SNT Workspace -#' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. +#' SNT workspace setup (paths, packages, OpenHEXA SDK) #' #' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' #' @param packages Character vector. R packages to install and load. #' @return List with SNT paths. #' #' @export -get_setup_variables <- function( +snt_setup <- function( SNT_ROOT_PATH='~/workspace', packages=c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue") ) { - # List required pcks required_packages <- unique(c(packages, "reticulate")) install_and_load(required_packages) @@ -45,6 +43,9 @@ get_setup_variables <- function( return(paths_to_check) } +# Alias (older name in notebooks / docs) +get_setup_variables <- snt_setup + #' Load SNT Configuration File #' Reads and parses a JSON configuration file. @@ -74,16 +75,21 @@ load_snt_config <- function(snt_config_path) { #' @return Dataframe containing the loaded data. #' #' @export -load_dataset_file <- function (dataset_id, filename, verbose=TRUE) { - data <- tryCatch({ - get_latest_dataset_file_in_memory(dataset_id, filename) - }, error = function(e) { +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { + if (!exists("openhexa", inherits = TRUE) || is.null(get("openhexa", inherits = TRUE))) { + stop("OpenHEXA SDK is not loaded or available.") + } + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, + error = function(e) { stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) - }) - - if (verbose) { + } + ) + if (verbose) { log_msg(glue::glue("{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse=', ')}]")) - } + } return(data) } @@ -312,7 +318,6 @@ merge_and_format_routine_data <- function(data, metadata, indicator_definitions) #' (default: 4326 for WGS84). #' #' @return An sf object with the GEOMETRY column converted to sfc geometry. -#' Invalid or NA geometries are replaced with empty geometry collections. #' #' @export geojson_to_sf <- function(data, geom_col = "GEOMETRY", crs = 4326) { @@ -327,24 +332,21 @@ geojson_to_sf <- function(data, geom_col = "GEOMETRY", crs = 4326) { # Use the matched column geom_col <- geom_col_match[1] + if (any(is.na(data[[geom_col]]) | !nzchar(as.character(data[[geom_col]])))) { + stop(paste0("Error: Column '", geom_col, "' contains missing GeoJSON values")) + } + # Convert GeoJSON strings to sfg objects geometry_sfc <- lapply(data[[geom_col]], function(g) { - if (is.na(g) || is.null(g)) return(sf::st_geometrycollection()) - tryCatch({ - geo <- geojsonsf::geojson_sfc(g) - geo[[1]] # extract sfg - }, error = function(e) { - sf::st_geometrycollection() # return empty but valid geometry - }) + geo <- geojsonsf::geojson_sfc(g) + geo[[1]] }) # Convert to sfc geometry_sfc <- sf::st_sfc(geometry_sfc, crs = crs) - # Create sf object (exclude original geometry column) cols_to_keep <- setdiff(names(data), geom_col) data_no_geom <- data[, cols_to_keep, drop = FALSE] - # data_no_geom <- data[, !names(data) %in% geom_col, drop = FALSE] shapes_sf <- sf::st_sf(data_no_geom, geometry = geometry_sfc) return(shapes_sf) @@ -700,59 +702,22 @@ extract_geo_coords <- function(geom_json) { } -#' Safely Read GeoJSON File +#' Read GeoJSON File #' -#' @description Reads a GeoJSON file from a specified path with built-in error handling. -#' Checks if the file exists and catches parsing errors if the file is corrupted. +#' @description Reads a GeoJSON file from a specified path. #' #' @param file_path A character string specifying the full path to the GeoJSON file. #' -#' @return A spatial dataframe (sf object) if successful, or NULL if the process fails. +#' @return A spatial dataframe (sf object). #' @export read_geojson_safe <- function(file_path) { - - # 1. Check if the file exists in the folder if (!file.exists(file_path)) { - # If you have a custom log_msg function from earlier, you can swap 'message' for it! - log_msg(glue("File does not exist at the specified path: {file_path}"), "error") - return(NULL) + stop(glue("GeoJSON file does not exist: {file_path}")) } - - # 2. Try to read the file and catch corruption/parsing errors - geo_data <- tryCatch({ sf::read_sf(file_path, quiet = TRUE)}, - error = function(e) { - log_msg(glue("Failed to parse the GeoJSON file. It may be corrupted. R says: {e$message}"), "error") - return(NULL) - }) - - return(geo_data) + sf::read_sf(file_path, quiet = TRUE) } -#' Safely Read GeoJSON File -#' -#' @description Reads a GeoJSON file from a specified path with built-in error handling. -#' Checks if the file exists and catches parsing errors if the file is corrupted. -#' -#' @details This function wraps \code{sf::read_sf()} inside a \code{tryCatch} block. -#' It is particularly useful in automated data pipelines where missing or corrupted -#' geographic files should be logged but shouldn't crash the entire script. -#' -#' @param file_path A character string specifying the full path to the GeoJSON file. -#' -#' @return A spatial dataframe (\code{sf} object) if successful, or \code{NULL} if the process fails. -#' -#' @examples -#' \dontrun{ -#' # Example of a successful read -#' my_geo_data <- read_geojson_safe("data/valid_regions.geojson") -#' -#' # Example of handling a missing file gracefully (returns NULL) -#' missing_data <- read_geojson_safe("data/does_not_exist.geojson") -#' } -#' -#' @importFrom sf read_sf -#' @importFrom glue glue #' @export points_within_country_batch <- function(lon_vec, lat_vec, boundary_sf) { out <- rep(FALSE, length(lon_vec)) @@ -791,9 +756,8 @@ prepare_country_boundary <- function(country_shapes_sf) { stop("[ERROR] Country shapes must be an sf object.") } - # warn the user if the CRS is missing, but don't guess what it is! if (is.na(sf::st_crs(country_shapes_sf))) { - warning("CRS is missing from the input shapefile. Proceeding without a defined CRS.") + stop("[ERROR] CRS is missing from the input shapefile.") } country_boundary <- sf::st_union(sf::st_geometry(country_shapes_sf)) @@ -802,47 +766,6 @@ prepare_country_boundary <- function(country_shapes_sf) { return(sf::st_sf(GEOMETRY = country_boundary)) } - - -fix_coordinate_pair_in_country <- function(lon, lat, boundary_sf, max_shift = 2) { - if (is.na(lon) || is.na(lat)) { - return(list(LONGITUDE = NA_real_, LATITUDE = NA_real_, METHOD = "MISSING_COORDINATES", VALID = FALSE)) - } - - candidates <- build_coordinate_candidates(lon, lat, max_shift = max_shift) - candidate_names <- names(candidates) - - m <- matrix(NA_real_, nrow = length(candidate_names), ncol = 2) - for (j in seq_along(candidate_names)) { - cand <- candidates[[candidate_names[j]]] - m[j, 1] <- as.numeric(cand[1]) - m[j, 2] <- as.numeric(cand[2]) - } - - earth_ok <- abs(m[, 1]) <= 180 & abs(m[, 2]) <= 90 - if (!any(earth_ok)) { - return(list(LONGITUDE = NA_real_, LATITUDE = NA_real_, METHOD = "INVALID_NO_MATCH", VALID = FALSE)) - } - - pts <- sf::st_as_sf( - data.frame(LONGITUDE = m[earth_ok, 1], LATITUDE = m[earth_ok, 2]), - coords = c("LONGITUDE", "LATITUDE"), - crs = 4326 - ) - - inside <- rep(FALSE, nrow(m)) - inside[earth_ok] <- as.logical(sf::st_within(pts, boundary_sf, sparse = FALSE)[, 1]) - ok_idx <- which(earth_ok & inside) - - if (length(ok_idx) == 0) { - return(list(LONGITUDE = NA_real_, LATITUDE = NA_real_, METHOD = "INVALID_NO_MATCH", VALID = FALSE)) - } - - j <- min(ok_idx) - list(LONGITUDE = m[j, 1], LATITUDE = m[j, 2], METHOD = candidate_names[j], VALID = TRUE) -} - - #' Fix Coordinate Pair within Country Boundary #' #' @description Takes a longitude and latitude pair, generates potential candidate @@ -863,10 +786,6 @@ fix_coordinate_pair_in_country <- function(lon, lat, boundary_sf, max_shift = 2) if (is.na(lon) || is.na(lat)) { return(list(LONGITUDE = NA_real_, LATITUDE = NA_real_, METHOD = "MISSING_COORDINATES", VALID = FALSE)) } - - if (is.na(lon) || is.na(lat)) { - return(list(LONGITUDE = NA_real_, LATITUDE = NA_real_, METHOD = "MISSING_COORDINATES", VALID = FALSE)) - } candidates <- build_coordinate_candidates(lon, lat, max_shift = max_shift) candidate_names <- names(candidates) @@ -996,8 +915,7 @@ plot_fixed_coordinates <- function(fix_results, shapes_sf_boundary) { pts_df <- pts_df[!is.na(pts_df$LONGITUDE) & !is.na(pts_df$LATITUDE), ] if (nrow(pts_df) == 0) { - message("No valid coordinates to plot.") - return(NULL) + stop("No valid coordinates to plot.") } # 3. Convert the regular dataframe into an 'sf' spatial object (assuming standard 4326 CRS) @@ -1021,4 +939,303 @@ plot_fixed_coordinates <- function(fix_results, shapes_sf_boundary) { # Return the plot object in case you want to save it later using ggsave() invisible(map_plot) -} \ No newline at end of file +} + + +#' Write a ggplot to PNG (prefer this over huge objects kept in-session for notebooks). +save_ggplot_png <- function(plot, path, width, height, dpi = 120L, bg = "white") { + ggplot2::ggsave( + filename = path, + plot = plot, + width = width, + height = height, + dpi = dpi, + limitsize = FALSE, + bg = bg + ) + invisible(path) +} + + +#' Show a PNG in IRkernel / Jupyter after [save_ggplot_png] (frees RAM vs holding ggplot). +display_saved_png <- function(path) { + if (!file.exists(path)) { + stop("Missing plot file: ", path) + } + if (requireNamespace("IRdisplay", quietly = TRUE)) { + IRdisplay::display_png(file = path) + } else { + message("Saved plot (install IRdisplay for inline preview): ", path) + } + invisible(path) +} + + +normalize_report_plot_mode <- function(m) { + m <- tolower(as.character(m)[1]) + if (!m %in% c("none", "light", "full")) { + return("full") + } + m +} + + +#' Plot policy for the formatting report: `full`, `light` (smaller PNG / dpi), or `none` (skip figures). +#' +#' Set Papermill parameter `SNT_FORMAT_REPORT_PLOTS` or environment variable of the same name. +#' Env wins when non-empty so CI / OpenHEXA can force `none` without changing the notebook JSON. +report_plots_mode <- function() { + envv <- Sys.getenv("SNT_FORMAT_REPORT_PLOTS", unset = "") + if (nzchar(envv)) { + return(normalize_report_plot_mode(envv)) + } + if (exists("SNT_FORMAT_REPORT_PLOTS", envir = .GlobalEnv, inherits = FALSE)) { + return(normalize_report_plot_mode(get("SNT_FORMAT_REPORT_PLOTS", envir = .GlobalEnv))) + } + "full" +} + + +report_plots_skip <- function() { + identical(report_plots_mode(), "none") +} + + +report_plots_light <- function() { + identical(report_plots_mode(), "light") +} + + +#' Like [save_ggplot_png] but honours `SNT_FORMAT_REPORT_PLOTS`. +save_report_plot_png <- function(plot, path, width, height, dpi = 120L, bg = "white") { + if (report_plots_skip()) { + message("[SNT_FORMAT_REPORT_PLOTS=none] Figure omise : ", path) + return(invisible(NULL)) + } + if (report_plots_light()) { + dpi <- max(50L, as.integer(dpi * 0.6)) + width <- width * 0.65 + height <- height * 0.65 + } + save_ggplot_png(plot, path, width = width, height = height, dpi = dpi, bg = bg) +} + + +#' Like [display_saved_png] unless plots are disabled. +display_report_png <- function(path) { + if (report_plots_skip()) { + return(invisible(NULL)) + } + display_saved_png(path) +} + + +#' `print` a ggplot unless plots are disabled (avoids building draw buffers when `none`). +maybe_print_ggplot <- function(plot, label = "figure") { + if (report_plots_skip()) { + message("[SNT_FORMAT_REPORT_PLOTS=none] Affichage omis : ", label) + return(invisible(NULL)) + } + print(plot) +} + + +#' Wrapper around [ggplot2::ggsave] for notebook chunks (NER choropleths, etc.). +save_report_builtin_ggsave <- function(plot, filename, ...) { + if (report_plots_skip()) { + message("[SNT_FORMAT_REPORT_PLOTS=none] ggsave omis : ", filename) + return(invisible(NULL)) + } + args <- list(filename = filename, plot = plot) + dots <- list(...) + if (report_plots_light()) { + if (!is.null(dots$dpi)) { + dots$dpi <- max(48L, as.integer(dots$dpi * 0.55)) + } + if (!is.null(dots$width)) { + dots$width <- dots$width * 0.65 + } + if (!is.null(dots$height)) { + dots$height <- dots$height * 0.65 + } + } + do.call(ggplot2::ggsave, c(args, dots)) +} + + +#' Remove named objects if they exist, then run a light [gc]. +rm_if_exists <- function(names, envir = parent.frame()) { + names <- unique(as.character(names)) + found <- names[names %in% ls(envir = envir, all.names = FALSE)] + if (length(found)) { + rm(list = found, envir = envir) + } + invisible(gc(verbose = FALSE, full = FALSE)) +} + + +#' Split a character vector (e.g. indicator column names) into chunks of at most `batch_size`. +chunk_name_vector <- function(x, batch_size = 12L) { + batch_size <- max(1L, as.integer(batch_size)) + x <- as.character(x) + n <- length(x) + if (n == 0L) { + return(list()) + } + brks <- ceiling(seq_len(n) / batch_size) + unname(split(x, brks)) +} + + +#' HF completeness for one wide indicator column (no `pivot_longer` over all indicators). +summarise_completeness_hf_wide_one <- function(data_chunk, indicator_name) { + if (!indicator_name %in% names(data_chunk)) { + stop("Unknown indicator column: ", indicator_name) + } + long_one <- dplyr::transmute( + data_chunk, + OU = .data$OU_ID, + DATE = as.Date(paste0(.data$PERIOD, "01"), format = "%Y%m%d"), + INDICATOR = indicator_name, + VALUE = as.numeric(.data[[indicator_name]]) + ) + summarise_completeness_hf_long(long_one) +} + + +#' ADM2 completeness for one wide indicator (no factorial `crossing` over all indicators). +summarise_completeness_adm2_wide_one <- function(data_chunk, indicator_name) { + if (!indicator_name %in% names(data_chunk)) { + stop("Unknown indicator column: ", indicator_name) + } + + dc <- data_chunk %>% + dplyr::mutate(Date = as.Date(paste0(.data$PERIOD, "01"), format = "%Y%m%d")) + + U <- dplyr::distinct(dc, .data$ADM2_ID, .data$Date) + + tmp <- dplyr::transmute( + dc, + ADM2_ID = .data$ADM2_ID, + Date = .data$Date, + value = as.numeric(.data[[indicator_name]]) + ) + + rolled <- tmp %>% + dplyr::group_by(.data$ADM2_ID, .data$Date) %>% + dplyr::summarise( + is_missing = all(is.na(.data$value)), + is_zero = all(.data$value == 0, na.rm = TRUE), + is_positive = any(.data$value > 0, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::mutate(Indicator = indicator_name) + + full_one <- U %>% + dplyr::mutate(Indicator = indicator_name) %>% + dplyr::left_join(rolled, by = c("ADM2_ID", "Date", "Indicator")) %>% + dplyr::mutate( + is_missing = tidyr::replace_na(.data$is_missing, TRUE), + is_zero = tidyr::replace_na(.data$is_zero, FALSE), + is_positive = tidyr::replace_na(.data$is_positive, FALSE) + ) + + out <- full_one %>% + dplyr::group_by(.data$Indicator, .data$Date) %>% + dplyr::summarise( + n_total = dplyr::n_distinct(.data$ADM2_ID), + n_missing = sum(.data$is_missing), + n_zero = sum(.data$is_zero & !.data$is_missing), + n_positive = sum(.data$is_positive), + .groups = "drop" + ) %>% + dplyr::mutate( + pct_missing = ifelse(.data$n_total > 0, 100 * .data$n_missing / .data$n_total, 0), + pct_zero = ifelse(.data$n_total > 0, 100 * .data$n_zero / .data$n_total, 0), + pct_positive = ifelse(.data$n_total > 0, 100 * .data$n_positive / .data$n_total, 0) + ) + + rm(dc, U, tmp, rolled, full_one) + invisible(gc(verbose = FALSE, full = FALSE)) + out +} + + +#' Summarise HF-level completeness from long routine (OU × INDICATOR × DATE). +summarise_completeness_hf_long <- function(long_df) { + long_df %>% + dplyr::mutate( + is_missing = is.na(.data$VALUE), + is_zero = !is.na(.data$VALUE) & .data$VALUE == 0, + is_positive = !is.na(.data$VALUE) & .data$VALUE > 0 + ) %>% + dplyr::group_by(.data$INDICATOR, .data$DATE) %>% + dplyr::summarise( + n_total = dplyr::n_distinct(.data$OU), + n_missing = sum(.data$is_missing), + n_zero = sum(.data$is_zero), + n_positive = sum(.data$is_positive), + .groups = "drop" + ) %>% + dplyr::mutate( + pct_missing = ifelse(.data$n_total > 0, 100 * .data$n_missing / .data$n_total, 0), + pct_zero = ifelse(.data$n_total > 0, 100 * .data$n_zero / .data$n_total, 0), + pct_positive = ifelse(.data$n_total > 0, 100 * .data$n_positive / .data$n_total, 0) + ) +} + + +#' ADM2-level completeness summary for one year-chunk of wide routine data. +summarise_completeness_adm2_chunk <- function(data_chunk, indicator_cols) { + data_chunk <- data_chunk %>% + dplyr::mutate(Date = as.Date(paste0(.data$PERIOD, "01"), format = "%Y%m%d")) + + data_long_chunk <- data_chunk %>% + dplyr::select( + "ADM2_ID", "OU_ID", "Date", + tidyselect::all_of(indicator_cols) + ) %>% + tidyr::pivot_longer( + cols = tidyselect::all_of(indicator_cols), + names_to = "Indicator", + values_to = "value" + ) %>% + dplyr::mutate(value = as.numeric(.data$value)) + + out <- data_chunk %>% + dplyr::distinct(.data$ADM2_ID, .data$Date) %>% + tidyr::crossing(Indicator = indicator_cols) %>% + dplyr::left_join( + data_long_chunk %>% + dplyr::group_by(.data$ADM2_ID, .data$Indicator, .data$Date) %>% + dplyr::summarise( + is_missing = all(is.na(.data$value)), + is_zero = all(.data$value == 0, na.rm = TRUE), + is_positive = any(.data$value > 0, na.rm = TRUE), + .groups = "drop" + ), + by = c("ADM2_ID", "Indicator", "Date") + ) %>% + dplyr::mutate( + is_missing = tidyr::replace_na(.data$is_missing, TRUE), + is_zero = tidyr::replace_na(.data$is_zero, FALSE), + is_positive = tidyr::replace_na(.data$is_positive, FALSE) + ) %>% + dplyr::group_by(.data$Indicator, .data$Date) %>% + dplyr::summarise( + n_total = dplyr::n_distinct(.data$ADM2_ID), + n_missing = sum(.data$is_missing), + n_zero = sum(.data$is_zero & !.data$is_missing), + n_positive = sum(.data$is_positive), + .groups = "drop" + ) %>% + dplyr::mutate( + pct_missing = ifelse(.data$n_total > 0, 100 * .data$n_missing / .data$n_total, 0), + pct_zero = ifelse(.data$n_total > 0, 100 * .data$n_zero / .data$n_total, 0), + pct_positive = ifelse(.data$n_total > 0, 100 * .data$n_positive / .data$n_total, 0) + ) + + rm(data_long_chunk) + invisible(gc(verbose = FALSE, full = FALSE)) + out +} diff --git a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r index 3d5199a..ba8ca7d 100644 --- a/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r +++ b/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting_report.r @@ -1,6 +1,10 @@ -# Shared helpers for snt_dhis2_formatting reporting notebook. +# Helpers et orchestration pour le rapport Jupyter / Papermill (snt_dhis2_formatting_report.ipynb). printdim <- function(df, name = deparse(substitute(df))) { + if (is.null(df)) { + cat("Dimensions of", name, ": NULL (not loaded)\n\n") + return(invisible(NULL)) + } cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") } @@ -26,3 +30,502 @@ create_dynamic_labels <- function(breaks) { paste0("> ", fmt(breaks[length(breaks)]), "k") ) } + + +# ---- Helpers utilisés par les cellules du rapport (mémoire, plafonds, agrégations) ---- + +#' Limite le nombre d'indicateurs distincts (ordre alphabétique, comme la section 1.2). +report_limit_long_indicators <- function(long_data, indicator_col = "INDICATOR", max_n = 40L) { + max_n <- suppressWarnings(as.integer(max_n)[1]) + if (is.null(long_data) || !NROW(long_data)) { + return(long_data) + } + if (!indicator_col %in% names(long_data)) { + stop("report_limit_long_indicators: colonne manquante : ", indicator_col) + } + if (!length(max_n) || is.na(max_n) || max_n < 1L) { + return(long_data) + } + lvls <- sort(unique(as.character(long_data[[indicator_col]]))) + if (length(lvls) <= max_n) { + return(long_data) + } + keep <- lvls[seq_len(max_n)] + dplyr::filter(long_data, as.character(.data[[indicator_col]]) %in% keep) +} + + +#' Supprime des objets du `.GlobalEnv` puis `gc` (allège la RAM entre grosses étapes). +report_release_objects <- function(..., full_gc = FALSE) { + nms <- unique(as.character(unlist(list(...), use.names = FALSE))) + for (n in nms) { + if (exists(n, envir = .GlobalEnv, inherits = FALSE)) { + rm(list = n, envir = .GlobalEnv) + } + } + if (isTRUE(full_gc)) { + invisible(gc(verbose = FALSE, full = TRUE)) + } else { + invisible(gc(verbose = FALSE, full = FALSE)) + } + invisible(NULL) +} + + +#' Garde les `n_months` derniers mois calendaires par rapport au max de `date_col`. +report_filter_recent_months <- function(df, date_col, n_months) { + if (is.null(df) || !NROW(df)) { + return(df) + } + n_months <- suppressWarnings(as.integer(n_months)[1]) + if (!length(n_months) || is.na(n_months) || n_months < 1L) { + return(df) + } + d <- as.Date(df[[date_col]]) + maxd <- suppressWarnings(max(d, na.rm = TRUE)) + if (!is.finite(as.numeric(maxd))) { + return(df) + } + cutoff <- suppressWarnings(min(seq.Date(maxd, by = "-1 month", length.out = n_months), na.rm = TRUE)) + dplyr::filter(df, !is.na(.data[[date_col]]), as.Date(.data[[date_col]]) >= cutoff) +} + + +#' Exécute un bloc graphique. +run_report_plot <- function(label, plot_fn) { + if (exists("report_plots_skip", mode = "function", inherits = TRUE) && isTRUE(report_plots_skip())) { + message("[SNT_FORMAT_REPORT_PLOTS=none] Bloc omis : ", label) + return(invisible(NULL)) + } + plot_fn() +} + + +#' TRUE si le tracé doit être omis (politique `none` ou trop de lignes vs `SNT_REPORT_PLOT_MAX_ROWS`). +skip_heavy_plot_input <- function(df, label = "plot input") { + if (exists("report_plots_skip", mode = "function", inherits = TRUE) && isTRUE(report_plots_skip())) { + return(TRUE) + } + lim <- suppressWarnings(as.integer(Sys.getenv("SNT_REPORT_PLOT_MAX_ROWS", unset = "1000000"))[1]) + if (!is.finite(lim) || lim < 1L) { + lim <- 1000000L + } + nr <- if (inherits(df, "sf")) { + nrow(sf::st_drop_geometry(df)) + } else { + nrow(as.data.frame(df)) + } + if (nr > lim) { + message("[skip] ", label, ": ", nr, " lignes > ", lim, " (SNT_REPORT_PLOT_MAX_ROWS)") + return(TRUE) + } + FALSE +} + + +#' Sous-échantillon aléatoire simple pour limiter le coût des nuages de points. +report_downsample_rows <- function(df, max_rows) { + max_rows <- suppressWarnings(as.integer(max_rows)[1]) + if (is.null(df) || !NROW(df)) { + return(df) + } + if (!length(max_rows) || is.na(max_rows) || max_rows < 1L) { + return(df) + } + nr <- nrow(df) + if (nr <= max_rows) { + return(df) + } + i <- sample.int(nr, max_rows) + df[i, , drop = FALSE] +} + + +#' Agrège la routine large en ADM2 × mois (sommes sur les OU) pour indicateurs composites usuels. +routine_adm2_month_from_wide <- function(routine_data) { + req <- c("PERIOD", "ADM2_ID") + miss <- setdiff(req, names(routine_data)) + if (length(miss)) { + stop("routine_adm2_month_from_wide: colonnes manquantes : ", paste(miss, collapse = ", ")) + } + inds <- intersect(c("SUSP", "TEST", "CONF", "MALTREAT", "PRES"), names(routine_data)) + b <- routine_data %>% + dplyr::mutate(DATE = as.Date(paste0(as.character(.data$PERIOD), "01"), format = "%Y%m%d")) + if (length(inds)) { + b %>% + dplyr::group_by(.data$ADM2_ID, .data$PERIOD, .data$DATE) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(inds), ~ sum(as.numeric(.x), na.rm = TRUE)), + .groups = "drop" + ) + } else { + dplyr::distinct(b, .data$ADM2_ID, .data$PERIOD, .data$DATE) + } +} + + +#' Totaux nationaux par mois ; retire en fin de série les mois entièrement à 0 (extrait incomplet). +routine_national_month_from_wide <- function(routine_data) { + vars_nat <- intersect(c("SUSP", "TEST", "CONF", "PRES"), names(routine_data)) + if (!length(vars_nat) || !"PERIOD" %in% names(routine_data)) { + return(dplyr::tibble(DATE = as.Date(character()))) + } + out <- routine_data %>% + dplyr::mutate(DATE = as.Date(paste0(as.character(.data$PERIOD), "01"), format = "%Y%m%d")) %>% + dplyr::group_by(.data$DATE) %>% + dplyr::summarise( + dplyr::across(dplyr::all_of(vars_nat), ~ sum(as.numeric(.x), na.rm = TRUE)), + .groups = "drop" + ) %>% + dplyr::arrange(.data$DATE) + if (!NROW(out)) { + return(out) + } + m <- as.matrix(out[, vars_nat, drop = TRUE]) + all_zero <- apply(m, 1L, function(row) all(row == 0 & !is.na(row))) + while (length(all_zero) && all_zero[length(all_zero)]) { + out <- out[-nrow(out), , drop = FALSE] + if (!NROW(out)) { + break + } + m <- as.matrix(out[, vars_nat, drop = TRUE]) + all_zero <- apply(m, 1L, function(row) all(row == 0 & !is.na(row))) + } + out +} + + +# Meme logique que expand_grid(OU, INDICATOR, DATE) + left_join(long_data), mais une +# grille OU x DATE a la fois par indicateur (RAM). Colonnes attendues = notebook +# snt_dhis2_formatting_report (OU, INDICATOR, DATE, VALUE). +reporting_summary_facility_chunked <- function(long_data) { + req <- c("OU", "INDICATOR", "DATE", "VALUE") + miss <- setdiff(req, names(long_data)) + if (length(miss) > 0L) { + stop(paste0("[ERROR] reporting_summary_facility_chunked: missing columns: ", paste(miss, collapse = ", "))) + } + + ld <- dplyr::transmute( + long_data, + OU = as.character(.data$OU), + INDICATOR = as.character(.data$INDICATOR), + DATE = as.Date(.data$DATE), + VALUE = .data$VALUE + ) + ld <- dplyr::filter(ld, !is.na(.data$INDICATOR), !is.na(.data$OU), !is.na(.data$DATE)) + ld <- dplyr::distinct(ld, OU, INDICATOR, DATE, .keep_all = TRUE) + + if (nrow(ld) == 0L) { + return(dplyr::tibble( + INDICATOR = character(), + DATE = as.Date(character()), + n_total = integer(), + n_missing = integer(), + n_zero = integer(), + n_positive = integer(), + pct_missing = numeric(), + pct_zero = numeric(), + pct_positive = numeric() + )) + } + + ou_levels <- dplyr::distinct(dplyr::select(ld, "OU")) + date_levels <- dplyr::distinct(dplyr::select(ld, "DATE")) + ou_date_grid <- tidyr::crossing(ou_levels, date_levels) + chunks <- split(ld, ld$INDICATOR, drop = TRUE) + + out <- purrr::imap_dfr(chunks, function(chunk, ind) { + ind <- as.character(ind) + rhs <- dplyr::transmute( + chunk, + OU = as.character(.data$OU), + INDICATOR = ind, + DATE = as.Date(.data$DATE), + VALUE = .data$VALUE + ) + rhs <- dplyr::distinct(rhs, OU, INDICATOR, DATE, .keep_all = TRUE) + dplyr::left_join( + dplyr::mutate(ou_date_grid, INDICATOR = ind), + rhs, + by = c("OU", "INDICATOR", "DATE") + ) %>% + dplyr::group_by(.data$INDICATOR, .data$DATE) %>% + dplyr::summarise( + n_total = dplyr::n_distinct(.data$OU), + n_missing = sum(is.na(.data$VALUE)), + n_zero = sum(.data$VALUE == 0 & !is.na(.data$VALUE)), + n_positive = sum(.data$VALUE > 0 & !is.na(.data$VALUE)), + pct_missing = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_missing / .data$n_total, 0), + pct_zero = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_zero / .data$n_total, 0), + pct_positive = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_positive / .data$n_total, 0), + .groups = "drop" + ) + }) + + if (nrow(out) == 0L) { + return(dplyr::tibble( + INDICATOR = character(), + DATE = as.Date(character()), + n_total = integer(), + n_missing = integer(), + n_zero = integer(), + n_positive = integer(), + pct_missing = numeric(), + pct_zero = numeric(), + pct_positive = numeric() + )) + } + dplyr::arrange(out, .data$INDICATOR, .data$DATE) +} + + +# Meme idee au niveau ADM2. Colonnes attendues = notebook (ADM2_ID, Date, Indicator, value). +reporting_summary_adm2_chunked <- function(data_long) { + req <- c("ADM2_ID", "Date", "Indicator", "value") + miss <- setdiff(req, names(data_long)) + if (length(miss) > 0L) { + stop(paste0("[ERROR] reporting_summary_adm2_chunked: missing columns: ", paste(miss, collapse = ", "))) + } + + dl <- dplyr::transmute( + data_long, + ADM2_ID = as.character(.data$ADM2_ID), + Indicator = as.character(.data$Indicator), + Date = as.Date(.data$Date), + value = .data$value + ) + dl <- dplyr::filter(dl, !is.na(.data$Indicator), !is.na(.data$ADM2_ID), !is.na(.data$Date)) + + if (nrow(dl) == 0L) { + return(dplyr::tibble( + Indicator = character(), + Date = as.Date(character()), + n_total = integer(), + n_missing = integer(), + n_zero = integer(), + n_positive = integer(), + pct_missing = numeric(), + pct_zero = numeric(), + pct_positive = numeric() + )) + } + + adm_levels <- dplyr::distinct(dplyr::select(dl, "ADM2_ID")) + date_levels <- dplyr::distinct(dplyr::select(dl, "Date")) + adm_date_grid <- tidyr::crossing(adm_levels, date_levels) + + reporting_check <- dl %>% + dplyr::group_by(.data$ADM2_ID, .data$Indicator, .data$Date) %>% + dplyr::summarise( + is_missing = all(is.na(.data$value)), + is_zero = all(.data$value == 0, na.rm = TRUE), + is_positive = any(.data$value > 0, na.rm = TRUE), + .groups = "drop" + ) + + rc_chunks <- split(reporting_check, reporting_check$Indicator, drop = TRUE) + + out <- purrr::imap_dfr(rc_chunks, function(rc_chunk, ind) { + ind <- as.character(ind) + rc_chunk <- rc_chunk %>% + dplyr::mutate(Date = as.Date(.data$Date)) %>% + dplyr::distinct(ADM2_ID, Indicator, Date, .keep_all = TRUE) + rhs <- dplyr::transmute( + rc_chunk, + ADM2_ID = as.character(.data$ADM2_ID), + Indicator = ind, + Date = as.Date(.data$Date), + is_missing = .data$is_missing, + is_zero = .data$is_zero, + is_positive = .data$is_positive + ) + rhs <- dplyr::distinct(rhs, ADM2_ID, Indicator, Date, .keep_all = TRUE) + dplyr::left_join( + dplyr::mutate(adm_date_grid, Indicator = ind), + rhs, + by = c("ADM2_ID", "Indicator", "Date") + ) %>% + dplyr::mutate( + is_missing = tidyr::replace_na(.data$is_missing, TRUE), + is_zero = tidyr::replace_na(.data$is_zero, FALSE), + is_positive = tidyr::replace_na(.data$is_positive, FALSE) + ) %>% + dplyr::group_by(.data$Indicator, .data$Date) %>% + dplyr::summarise( + n_total = dplyr::n_distinct(.data$ADM2_ID), + n_missing = sum(.data$is_missing, na.rm = TRUE), + n_zero = sum(.data$is_zero & !.data$is_missing, na.rm = TRUE), + n_positive = sum(.data$is_positive, na.rm = TRUE), + pct_missing = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_missing / .data$n_total, 0), + pct_zero = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_zero / .data$n_total, 0), + pct_positive = dplyr::if_else(.data$n_total > 0L, 100 * .data$n_positive / .data$n_total, 0), + .groups = "drop" + ) + }) + + if (nrow(out) == 0L) { + return(dplyr::tibble( + Indicator = character(), + Date = as.Date(character()), + n_total = integer(), + n_missing = integer(), + n_zero = integer(), + n_positive = integer(), + pct_missing = numeric(), + pct_zero = numeric(), + pct_positive = numeric() + )) + } + dplyr::arrange(out, .data$Indicator, .data$Date) +} + + +# ---- Bootstrap notebook report + chargements parquet ---- + +.report_param_or_default <- function(name, default) { + if (!exists(name, envir = .GlobalEnv, inherits = FALSE)) { + return(default) + } + value <- get(name, envir = .GlobalEnv, inherits = FALSE) + if (length(value) == 0L || all(is.na(value))) { + return(default) + } + value +} + + +formatting_report_build_paths <- function(snt_root_path = "~/workspace") { + root <- path.expand(as.character(snt_root_path)[1]) + pipeline_path <- file.path(root, "pipelines", "snt_dhis2_formatting") + reporting_path <- file.path(pipeline_path, "reporting") + figures_dir <- file.path(reporting_path, "outputs", "figures") + dir.create(figures_dir, recursive = TRUE, showWarnings = FALSE) + list( + SNT_ROOT_PATH = root, + CODE_PATH = file.path(root, "code"), + CONFIG_PATH = file.path(root, "configuration"), + PIPELINE_PATH = pipeline_path, + REPORTING_NB_PATH = reporting_path, + figures_dir = figures_dir + ) +} + + +formatting_report_read_snt_config <- function(config_path) { + tryCatch( + jsonlite::fromJSON(file.path(config_path, "SNT_config.json")), + error = function(e) { + stop("Erreur chargement configuration : ", conditionMessage(e)) + } + ) +} + + +formatting_report_assign_ids_from_config <- function(config_json) { + list( + dataset_name = config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED, + COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, + COUNTRY_NAME = config_json$SNT_CONFIG$COUNTRY_NAME, + ADM_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2) + ) +} + + +# Bootstrap unique du report (paths + packages + SDK + config + IDs). +snt_setup_report <- function( + snt_root_path = "~/workspace", + report_max_years = .report_param_or_default("REPORT_MAX_YEARS", 10L), + report_plot_months = .report_param_or_default("REPORT_PLOT_MONTHS", 36L), + report_max_indicators = .report_param_or_default("REPORT_MAX_INDICATORS", 40L), + report_pop_scatter_max_rows = .report_param_or_default("REPORT_POP_SCATTER_MAX_ROWS", 4000L), + report_shape_simplify_tol = .report_param_or_default("REPORT_SHAPE_SIMPLIFY_TOL", 0.002), + report_fig_dpi = .report_param_or_default("REPORT_FIG_DPI", 120L), + required_packages = c( + "dplyr", "tidyr", "ggplot2", "forcats", "lubridate", "stringr", "purrr", "rlang", + "scales", "arrow", "sf", "reticulate", "patchwork", "jsonlite", "httr", "IRdisplay" + ) +) { + paths <- formatting_report_build_paths(snt_root_path) + source(file.path(paths$CODE_PATH, "snt_utils.r")) + source(file.path(paths$PIPELINE_PATH, "utils", "snt_dhis2_formatting.r")) + snt_setup( + SNT_ROOT_PATH = paths$SNT_ROOT_PATH, + packages = required_packages + ) + if (Sys.getenv("SNT_REPORT_PLOT_MAX_ROWS", unset = "") == "") { + Sys.setenv(SNT_REPORT_PLOT_MAX_ROWS = "800000") + } + config_json <- formatting_report_read_snt_config(paths$CONFIG_PATH) + ids <- formatting_report_assign_ids_from_config(config_json) + sdk_available <- exists("openhexa", inherits = TRUE) && !is.null(get("openhexa", inherits = TRUE)) + if (!sdk_available) { + warning("OpenHEXA SDK is not loaded or available.") + } + c( + paths, + list( + REPORT_MAX_YEARS = suppressWarnings(as.integer(report_max_years)[1]), + REPORT_PLOT_MONTHS = suppressWarnings(as.integer(report_plot_months)[1]), + REPORT_MAX_INDICATORS = suppressWarnings(as.integer(report_max_indicators)[1]), + REPORT_POP_SCATTER_MAX_ROWS = suppressWarnings(as.integer(report_pop_scatter_max_rows)[1]), + REPORT_SHAPE_SIMPLIFY_TOL = suppressWarnings(as.numeric(report_shape_simplify_tol)[1]), + REPORT_FIG_DPI = suppressWarnings(as.integer(report_fig_dpi)[1]), + config_json = config_json, + openhexa_available = sdk_available + ), + ids + ) +} + + +# Simplifie les shapes et retourne l'objet transforme. +formatting_report_simplify_shapes <- function(shapes_data, simplify_tol = 0.002) { + if (is.null(shapes_data)) stop("formatting_report_simplify_shapes: shapes_data is required.") + tol <- suppressWarnings(as.numeric(simplify_tol)[1]) + if (!is.finite(tol)) stop("formatting_report_simplify_shapes: simplify_tol must be numeric.") + sf::st_simplify(shapes_data, dTolerance = tol, preserveTopology = TRUE) +} + + +# Routine parquet + filtre annees + printdim. +formatting_report_load_routine_data <- function(setup) { + routine_data <- load_dataset_file( + setup$dataset_name, + paste0(setup$COUNTRY_CODE, "_routine.parquet"), + verbose = TRUE + ) + max_years <- suppressWarnings(as.integer(setup$REPORT_MAX_YEARS)[1]) + if (is.finite(max_years) && max_years > 0L && "YEAR" %in% names(routine_data)) { + y_end <- suppressWarnings(max(as.numeric(routine_data$YEAR), na.rm = TRUE)) + routine_data <- dplyr::filter( + routine_data, + suppressWarnings(as.numeric(YEAR)) > y_end - max_years + ) + } + printdim(routine_data) + routine_data +} + + +# Population parquet + printdim. +formatting_report_load_population_data <- function(setup) { + population_data <- load_dataset_file( + setup$dataset_name, + paste0(setup$COUNTRY_CODE, "_population.parquet"), + verbose = TRUE + ) + printdim(population_data) + population_data +} + + +# Shapes geojson + printdim. +formatting_report_load_shapes_data <- function(setup) { + shapes_data <- load_dataset_file( + setup$dataset_name, + paste0(setup$COUNTRY_CODE, "_shapes.geojson"), + verbose = TRUE + ) + printdim(shapes_data) + shapes_data +}