Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@
"cell_type": "code",
"execution_count": null,
"id": "52fae14b-d3bc-4a74-9c36-89e854e70636",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"source(file.path(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\"))\n",
"setup_var <- get_setup_variables(packages= c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"))\n",
"source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n",
"setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"arrow\", \"dplyr\", \"tidyr\", \"stringr\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"))\n",
"\n",
"# Load config\n",
"config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n",
Expand Down Expand Up @@ -49,7 +53,11 @@
"cell_type": "code",
"execution_count": null,
"id": "6febcacc-afff-4cbb-ac0d-bc2bb00edff4",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"dhis2_pyramid_data <- load_dataset_file(extracts_dataset_id, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\"), verbose=FALSE)\n",
Expand Down Expand Up @@ -77,7 +85,11 @@
"cell_type": "code",
"execution_count": null,
"id": "6b1cda01-c507-4fb4-85d2-1a568df219dc",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"dhis2_data <- load_dataset_file(extracts_dataset_id, paste0(COUNTRY_CODE, \"_dhis2_raw_population.parquet\"), verbose=FALSE)\n",
Expand All @@ -100,7 +112,11 @@
"cell_type": "code",
"execution_count": null,
"id": "6e3f488b-95f1-4aff-b024-cf4e771fed63",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"population_table <- build_population_indicators(dhis2_data, dhis2_pyramid_data, config_json)\n",
Expand All @@ -123,7 +139,11 @@
"cell_type": "code",
"execution_count": null,
"id": "34713812-ca2f-452b-8a15-6d471409f5ff",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"admin_cols <- get_admin_config(config_json)\n",
Expand All @@ -146,7 +166,11 @@
"cell_type": "code",
"execution_count": null,
"id": "a32f4a3b-e9f1-4941-b2a2-80b4fa4eb2de",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Create pop template (util for pop transformation pipeline)\n",
Expand Down Expand Up @@ -191,7 +215,11 @@
"cell_type": "code",
"execution_count": null,
"id": "59891b6e-490e-4a22-8909-416520d168da",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# save template\n",
Expand Down
195 changes: 105 additions & 90 deletions pipelines/snt_dhis2_formatting/code/snt_dhis2_formatting_pyramid.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@
"cell_type": "code",
"execution_count": null,
"id": "fca7918b-f8bf-4e39-9601-febf5ca7877a",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"source(\"~/workspace/pipelines/snt_dhis2_formatting/utils/snt_dhis2_formatting.r\")\n",
"setup_var <- get_setup_variables(packages= c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"stringi\", \"stringr\", \"jsonlite\", \"httr\", \"glue\"))\n",
"setup_var <- snt_setup(SNT_ROOT_PATH = \"~/workspace\", packages = c(\"lubridate\", \"zoo\", \"arrow\", \"dplyr\", \"stringi\", \"stringr\", \"jsonlite\", \"httr\", \"glue\"))\n",
"\n",
"# Load config\n",
"config_json <- load_snt_config(file.path(setup_var$CONFIG_PATH, \"SNT_config.json\"))\n",
Expand Down Expand Up @@ -67,7 +71,11 @@
"cell_type": "code",
"execution_count": null,
"id": "8faec80e-2b3e-4162-8767-d6c5c28eadcf",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"log_msg(glue(\"Start DHIS2 organisation units(pyramid) formatting.\")) \n",
Expand All @@ -89,7 +97,11 @@
"cell_type": "code",
"execution_count": null,
"id": "076dacc5-f449-4ab1-9168-9dbc766d295f",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Extract lon/lat from geometry\n",
Expand All @@ -102,95 +114,97 @@
"id": "c25113ed-090a-418c-9a5c-2b30e0f2f773",
"metadata": {},
"source": [
"### Try coordinates validation steps"
"### Coordinate validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43d9f9dc-97fc-4083-bc17-e78d0037e02d",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Step 1 - Try Load country border from folder (if exists)\n",
"shapes_sf <- read_geojson_safe(file.path(\"~/workspace/data/dhis2/extracts_formatted/\" , paste0(COUNTRY_CODE, \"_shapes.geojson\")))"
"shapes_sf <- read_geojson_safe(file.path(\"~/workspace/data/dhis2/extracts_formatted/\", paste0(COUNTRY_CODE, \"_shapes.geojson\")))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8d11c71-0921-4a1e-b005-737b18392383",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Step 2 - Keep original coordinates already inside the country (If shapes are available)\n",
"log_msg(\"Running coordinate boundary validation\")\n",
"\n",
"if (!is.null(shapes_sf)) {\n",
" log_msg(\"Running coordinate boundary validation\")\n",
" \n",
" shapes_sf_boundary <- prepare_country_boundary(shapes_sf)\n",
" lon0 <- pyramid_data_coords$LONGITUDE\n",
" lat0 <- pyramid_data_coords$LATITUDE\n",
" within_original <- points_within_country_batch(lon0, lat0, shapes_sf_boundary)\n",
" has_coords <- !is.na(lon0) & !is.na(lat0)\n",
" \n",
" coord_fix_df <- tibble(\n",
" LONGITUDE_ORIGINAL = lon0,\n",
" LATITUDE_ORIGINAL = lat0,\n",
" LONGITUDE_FIXED = NA_real_,\n",
" LATITUDE_FIXED = NA_real_,\n",
" COORD_FIX_METHOD = NA_character_,\n",
" COORD_IS_VALID = FALSE\n",
" )\n",
" \n",
" ok_original <- has_coords & within_original \n",
" \n",
" if (any(ok_original)) {\n",
" coord_fix_df$LONGITUDE_FIXED[ok_original] <- lon0[ok_original]\n",
" coord_fix_df$LATITUDE_FIXED[ok_original] <- lat0[ok_original]\n",
" coord_fix_df$COORD_FIX_METHOD[ok_original] <- \"ORIGINAL\"\n",
" coord_fix_df$COORD_IS_VALID[ok_original] <- TRUE\n",
" }\n",
"} else {\n",
" log_msg(\"Skipped coordinate boundary validation: No reference shapes available.\")\n",
"shapes_sf_boundary <- prepare_country_boundary(shapes_sf)\n",
"lon0 <- pyramid_data_coords$LONGITUDE\n",
"lat0 <- pyramid_data_coords$LATITUDE\n",
"within_original <- points_within_country_batch(lon0, lat0, shapes_sf_boundary)\n",
"has_coords <- !is.na(lon0) & !is.na(lat0)\n",
"\n",
"coord_fix_df <- tibble(\n",
" LONGITUDE_ORIGINAL = lon0,\n",
" LATITUDE_ORIGINAL = lat0,\n",
" LONGITUDE_FIXED = NA_real_,\n",
" LATITUDE_FIXED = NA_real_,\n",
" COORD_FIX_METHOD = NA_character_,\n",
" COORD_IS_VALID = FALSE\n",
")\n",
"\n",
"ok_original <- has_coords & within_original\n",
"\n",
"if (any(ok_original)) {\n",
" coord_fix_df$LONGITUDE_FIXED[ok_original] <- lon0[ok_original]\n",
" coord_fix_df$LATITUDE_FIXED[ok_original] <- lat0[ok_original]\n",
" coord_fix_df$COORD_FIX_METHOD[ok_original] <- \"ORIGINAL\"\n",
" coord_fix_df$COORD_IS_VALID[ok_original] <- TRUE\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cea34fe-cc7c-4116-b018-5b8c895bb980",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Step 3 - For remaining points, try correction sequence\n",
"if (!is.null(shapes_sf)) {\n",
" \n",
" miss <- !has_coords\n",
" if (any(miss)) {\n",
" coord_fix_df$COORD_FIX_METHOD[miss] <- \"MISSING_COORDINATES\"\n",
" }\n",
"miss <- !has_coords\n",
"if (any(miss)) {\n",
" coord_fix_df$COORD_FIX_METHOD[miss] <- \"MISSING_COORDINATES\"\n",
"}\n",
"\n",
"need_fix <- !within_original & has_coords\n",
"log_msg(glue(\"Found {sum(need_fix)} / {length(need_fix)} coordinates that require fixing.\"))\n",
"\n",
"fix_results <- list()\n",
"if (any(need_fix)) {\n",
" idx_fix <- which(need_fix)\n",
" fix_results <- lapply(idx_fix, function(i) {\n",
" fix_coordinate_pair_in_country(lon0[i], lat0[i], shapes_sf_boundary, max_shift = 2)\n",
" })\n",
"\n",
" need_fix <- !within_original & has_coords \n",
" log_msg(glue(\"Found {sum(need_fix)} / {length(need_fix)} coordinates that require fixing.\"))\n",
" \n",
" if (any(need_fix)) {\n",
" idx_fix <- which(need_fix)\n",
" fix_results <- lapply(idx_fix, function(i) {\n",
" fix_coordinate_pair_in_country(lon0[i], lat0[i], shapes_sf_boundary, max_shift = 2)\n",
" })\n",
" \n",
" fixed_coords <- sum(sapply(fix_results, function(x) x$VALID == TRUE), na.rm = TRUE) \n",
" fixed_coords <- sum(sapply(fix_results, function(x) x$VALID == TRUE), na.rm = TRUE)\n",
" log_msg(glue(\"Points corrected: {fixed_coords} out of {sum(need_fix)}\"))\n",
" \n",
"\n",
" for (k in seq_along(idx_fix)) {\n",
" i <- idx_fix[k]\n",
" fr <- fix_results[[k]]\n",
" coord_fix_df$LONGITUDE_FIXED[i] <- fr$LONGITUDE\n",
" coord_fix_df$LATITUDE_FIXED[i] <- fr$LATITUDE\n",
" coord_fix_df$COORD_FIX_METHOD[i] <- fr$METHOD\n",
" coord_fix_df$COORD_IS_VALID[i] <- fr$VALID\n",
" }\n",
" }\n",
"}"
]
Expand All @@ -199,48 +213,49 @@
"cell_type": "code",
"execution_count": null,
"id": "eff4aff7-fe8d-4d47-b87f-b3fc8512d10e",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Display fixed points (if any)\n",
"if (exists(\"fixed_coords\") && length(fixed_coords) > 0) {\n",
" my_map <- plot_fixed_coordinates(fix_results, shapes_sf_boundary) \n",
"} "
"if (length(fix_results) > 0) {\n",
" my_map <- plot_fixed_coordinates(fix_results, shapes_sf_boundary)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49ef06a2-072d-4296-a220-6d5ab1948db0",
"metadata": {},
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# Step 6 - Apply final coordinates and flag invalids\n",
"if (!is.null(shapes_sf)) {\n",
" pyramid_data_coords$LONGITUDE <- coord_fix_df$LONGITUDE_FIXED\n",
" pyramid_data_coords$LATITUDE <- coord_fix_df$LATITUDE_FIXED\n",
" \n",
" invalid_coords <- pyramid_data_coords %>%\n",
" bind_cols(coord_fix_df %>% select(LONGITUDE_ORIGINAL, LATITUDE_ORIGINAL, COORD_FIX_METHOD, COORD_IS_VALID)) %>%\n",
" filter(!COORD_IS_VALID & !is.na(LONGITUDE_ORIGINAL) & !is.na(LATITUDE_ORIGINAL)) %>%\n",
" mutate(INVALID_COORD_REASON = \"NO_VALID_TRANSFORMATION_IN_COUNTRY\") %>% \n",
" select(-LONGITUDE, -LATITUDE)\n",
" \n",
" # Step 7 - Summary logs\n",
" n_total_coords <- sum(!is.na(coord_fix_df$LONGITUDE_ORIGINAL) & !is.na(coord_fix_df$LATITUDE_ORIGINAL))\n",
" n_kept_original <- sum(coord_fix_df$COORD_FIX_METHOD == \"ORIGINAL\", na.rm = TRUE)\n",
" n_corrected <- sum(coord_fix_df$COORD_IS_VALID & coord_fix_df$COORD_FIX_METHOD != \"ORIGINAL\", na.rm = TRUE)\n",
" n_invalid <- nrow(invalid_coords)\n",
" \n",
" log_msg(glue(\"Coordinate quality check over {n_total_coords} FOSAs: original valid={n_kept_original}, corrected={n_corrected}, invalid={n_invalid}.\"))\n",
" if (n_corrected > 0) {\n",
" log_msg(glue(\"Applied coordinate correction algorithm to {n_corrected} FOSAs (swap/sign/decimal left-to-right, k<=2).\"), \"warning\")\n",
" }\n",
" if (n_invalid > 0) {\n",
" log_msg(glue(\"{n_invalid} FOSAs remain invalid after correction attempts. LONGITUDE/LATITUDE set to NA.\"), \"warning\")\n",
" }\n",
"} else {\n",
" invalid_coords <- c()\n",
"pyramid_data_coords$LONGITUDE <- coord_fix_df$LONGITUDE_FIXED\n",
"pyramid_data_coords$LATITUDE <- coord_fix_df$LATITUDE_FIXED\n",
"\n",
"invalid_coords <- pyramid_data_coords %>%\n",
" bind_cols(coord_fix_df %>% select(LONGITUDE_ORIGINAL, LATITUDE_ORIGINAL, COORD_FIX_METHOD, COORD_IS_VALID)) %>%\n",
" filter(!COORD_IS_VALID & !is.na(LONGITUDE_ORIGINAL) & !is.na(LATITUDE_ORIGINAL)) %>%\n",
" mutate(INVALID_COORD_REASON = \"NO_VALID_TRANSFORMATION_IN_COUNTRY\") %>%\n",
" select(-LONGITUDE, -LATITUDE)\n",
"\n",
"n_total_coords <- sum(!is.na(coord_fix_df$LONGITUDE_ORIGINAL) & !is.na(coord_fix_df$LATITUDE_ORIGINAL))\n",
"n_kept_original <- sum(coord_fix_df$COORD_FIX_METHOD == \"ORIGINAL\", na.rm = TRUE)\n",
"n_corrected <- sum(coord_fix_df$COORD_IS_VALID & coord_fix_df$COORD_FIX_METHOD != \"ORIGINAL\", na.rm = TRUE)\n",
"n_invalid <- nrow(invalid_coords)\n",
"\n",
"log_msg(glue(\"Coordinate quality check over {n_total_coords} FOSAs: original valid={n_kept_original}, corrected={n_corrected}, invalid={n_invalid}.\"))\n",
"if (n_corrected > 0) {\n",
" log_msg(glue(\"Applied coordinate correction algorithm to {n_corrected} FOSAs (swap/sign/decimal left-to-right, k<=2).\"), \"warning\")\n",
"}\n",
"if (n_invalid > 0) {\n",
" log_msg(glue(\"{n_invalid} FOSAs remain invalid after correction attempts. LONGITUDE/LATITUDE set to NA.\"), \"warning\")\n",
"}\n",
"\n",
"head(pyramid_data_coords, 3)"
Expand Down
Loading