Build survey_tools and automated_reporting modules

claude · claude · commit 91ba5a485b2b · 2026-03-15T03:25:53.000Z
- sample_size_calculator.R, sampling_weights.R, survey_summary.R
- render_reports.R batch renderer, monthly_summary.qmd Quarto template
- Updated README with detailed module documentation
- Updated CHANGELOG
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,26 @@
 # Changelog
 
+## [v1.2.0] - 2025-04-28
+
+### Added
+- `survey_tools/` module with 3 R scripts:
+  - `sample_size_calculator.R` — simple, stratified, and cluster sampling with design effect estimation
+  - `sampling_weights.R` — base weight calculation, trimming, and weighted summary statistics
+  - `survey_summary.R` — complex survey design objects with weighted descriptives and proportions
+- `automated_reporting/` module with 2 files:
+  - `render_reports.R` — batch rendering for Quarto and RMarkdown with output logging
+  - `monthly_summary.qmd` — Quarto template for monthly indicator summary with inline visualizations
+
+### Improved
+- README updated with detailed survey_tools and automated_reporting documentation
+- Survey tools and automated reporting sections expanded from stubs to full descriptions
+
+## [v1.1.0] - 2025-04-24
+
+### Improved
+- Full repo cleanup: removed 14 placeholder files, updated docs, added .github config
+- README rewritten with honest content listing and ecosystem links
+
 ## [v1.0.0](https://github.com/Varnasr/FieldStack/tree/v1.0.0) (2025-04-19)
 
 [Full Changelog](https://github.com/Varnasr/FieldStack/compare/954b918bc01299272877fe2d2b65194fcf7a7eed...v1.0.0)
diff --git a/README.md b/README.md
@@ -38,13 +38,26 @@ This is the **applied research layer** of [OpenStacks for Change](https://openst
 | `codebook_templates/` | Variable metadata for health surveys and programme monitoring |
 | `tests/` | 9 testthat unit tests covering all core functions |
 
+### Survey Tools
+
+| Script | What It Does |
+|--------|-------------|
+| `survey_tools/sample_size_calculator.R` | Simple, stratified, and cluster sampling calculations with design effect |
+| `survey_tools/sampling_weights.R` | Base weight calculation, trimming, weighted summary statistics |
+| `survey_tools/survey_summary.R` | Survey design objects, weighted descriptives, proportions using the `survey` package |
+
+### Automated Reporting
+
+| File | What It Does |
+|------|-------------|
+| `automated_reporting/render_reports.R` | Batch Quarto/RMarkdown rendering with logging |
+| `automated_reporting/monthly_summary.qmd` | Quarto template for monthly indicator summary with inline plots |
+
 ### Supporting
 
 | Directory | What It Contains |
 |-----------|-----------------|
 | `python_integration/` | R-Python interop via reticulate |
-| `survey_tools/` | Survey data utilities |
-| `automated_reporting/` | Report generation workflows |
 
 ## Getting Started
 
diff --git a/automated_reporting/README.md b/automated_reporting/README.md
@@ -1,9 +1,17 @@
 # Automated Reporting
 
-This folder contains example scripts and templates for generating automated reports using Quarto and R. 
-You can batch-render `.qmd` files to PDF/HTML using the `rmarkdown::render()` or `quarto::quarto_render()` commands.
+R scripts and Quarto templates for generating automated field reports.
+
+## Contents
+
+| File | Purpose |
+|------|---------|
+| `render_reports.R` | Batch render Quarto notebooks to HTML/PDF |
+| `monthly_summary.qmd` | Template for monthly indicator summary report |
+
+## Usage
 
-Example:
 ```r
-quarto::quarto_render("monthly_summary.qmd", output_format = "html")
-```
+source("automated_reporting/render_reports.R")
+render_all_notebooks("notebooks/")  # path relative to the repo root, like the source() call above
+```
diff --git a/automated_reporting/monthly_summary.qmd b/automated_reporting/monthly_summary.qmd
@@ -0,0 +1,54 @@
+---
+title: "Monthly Indicator Summary"
+format: html
+date: today
+params:
+  data_path: "../sample_data/mel_indicators_wide.csv"
+---
+
+```{r setup, include=FALSE}
+library(tidyverse)
+knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
+```
+
+## Data Overview
+
+```{r load-data}
+# Read the indicator file named by the `data_path` document parameter
+df <- readr::read_csv(file = params$data_path, show_col_types = FALSE)
+
+# Report the size of the loaded table (rows, then columns)
+dataset_dims <- dim(df)
+cat(sprintf("Records: %d | Variables: %d\n", dataset_dims[1], dataset_dims[2]))
+```
+
+## Summary Statistics
+
+```{r summary}
+# Stack every numeric column into (indicator, value) pairs
+numeric_long <- pivot_longer(
+  select(df, where(is.numeric)),
+  cols = everything(),
+  names_to = "indicator",
+  values_to = "value"
+)
+
+# One row per indicator: non-missing count plus statistics rounded to 1 decimal
+indicator_stats <- summarise(
+  group_by(numeric_long, indicator),
+  n = sum(!is.na(value)),
+  mean = round(mean(value, na.rm = TRUE), 1),
+  median = round(median(value, na.rm = TRUE), 1),
+  min = round(min(value, na.rm = TRUE), 1),
+  max = round(max(value, na.rm = TRUE), 1),
+  .groups = "drop"
+)
+
+knitr::kable(indicator_stats)
+```
+
+## Distribution
+
+```{r plot, fig.width=8, fig.height=4}
+# Long-format copy of the numeric columns, one row per (indicator, value)
+histogram_data <- df %>%
+  select(where(is.numeric)) %>%
+  pivot_longer(everything(), names_to = "indicator", values_to = "value")
+
+# One histogram panel per indicator, each with its own axis scales
+ggplot(histogram_data, aes(x = value)) +
+  geom_histogram(bins = 20, fill = "#1a56db", alpha = 0.7) +
+  facet_wrap(~indicator, scales = "free") +
+  theme_minimal() +
+  labs(title = "Indicator Distributions", x = NULL, y = "Count")
+```
+
+---
+
+*Generated automatically by FieldStack automated reporting.*
diff --git a/automated_reporting/render_reports.R b/automated_reporting/render_reports.R
@@ -0,0 +1,78 @@
+# Automated Report Rendering
+# Batch render Quarto/R Markdown notebooks to HTML or PDF
+
+library(tidyverse)
+
+#' Render a single Quarto document
+#'
+#' Renders with the `quarto` package when it is installed and otherwise falls
+#' back to `rmarkdown::render()`. A missing input file produces a warning and
+#' no render is attempted.
+#'
+#' @param input Path to .qmd file
+#' @param output_format "html" or "pdf"
+#' @param output_dir Output directory (default: same as input). Created when
+#'   it does not exist yet.
+#' @return `NULL`, invisibly; called for its side effects (rendered file and
+#'   a progress line on the console).
+render_notebook <- function(input, output_format = "html", output_dir = NULL) {
+  if (!file.exists(input)) {
+    warning(paste("File not found:", input))
+    return(invisible(NULL))
+  }
+
+  if (!is.null(output_dir)) {
+    dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
+    # Use an absolute path so the result does not depend on how each
+    # renderer resolves relative directories.
+    output_dir <- normalizePath(output_dir, mustWork = FALSE)
+  }
+
+  if (requireNamespace("quarto", quietly = TRUE)) {
+    # quarto_render()'s `output_file` names the output *file*, so the
+    # directory is forwarded to the Quarto CLI via --output-dir instead.
+    # The argument is only added when needed, keeping the default call
+    # identical to a plain quarto_render(input, output_format = ...).
+    render_args <- list(input, output_format = output_format)
+    if (!is.null(output_dir)) {
+      render_args$quarto_args <- c("--output-dir", output_dir)
+    }
+    do.call(quarto::quarto_render, render_args)
+  } else if (requireNamespace("rmarkdown", quietly = TRUE)) {
+    rmarkdown::render(input, output_format = paste0(output_format, "_document"),
+                       output_dir = output_dir)
+  } else {
+    stop("Install quarto or rmarkdown package")
+  }
+  cat(sprintf("Rendered: %s -> %s\n", input, output_format))
+  invisible(NULL)
+}
+
+#' Render every .qmd file found in a directory
+#'
+#' Each file is rendered independently: an error in one notebook is reported
+#' on the console and recorded in the result, and the remaining files are
+#' still attempted.
+#'
+#' @param dir Directory to search for .qmd files
+#' @param output_format "html" or "pdf"
+#' @param output_dir Output directory for rendered files
+#' @param recursive Search subdirectories
+#' @return Data frame with one row per file (`file`, `status`), or `NULL`
+#'   (invisibly) when no .qmd files are found.
+render_all_notebooks <- function(dir = ".", output_format = "html",
+                                   output_dir = NULL, recursive = FALSE) {
+  qmd_paths <- list.files(dir, pattern = "\\.qmd$", full.names = TRUE, recursive = recursive)
+
+  if (length(qmd_paths) == 0) {
+    cat("No .qmd files found in", dir, "\n")
+    return(invisible(NULL))
+  }
+
+  cat(sprintf("Found %d notebook(s) to render:\n", length(qmd_paths)))
+
+  # Render one file and describe the outcome as a one-row data frame
+  render_one <- function(path) {
+    outcome <- tryCatch({
+      render_notebook(path, output_format, output_dir)
+      "success"
+    }, error = function(err) {
+      cat(sprintf("  FAILED: %s (%s)\n", path, err$message))
+      paste("failed:", err$message)
+    })
+    data.frame(file = path, status = outcome, stringsAsFactors = FALSE)
+  }
+
+  bind_rows(lapply(qmd_paths, render_one))
+}
+
+#' Generate a summary table from rendered reports
+#'
+#' Counts how many files were attempted, how many rendered successfully
+#' (`status == "success"`) and how many did not.
+#'
+#' @param results Output from render_all_notebooks: a data frame with a
+#'   `status` column, or `NULL` when no notebooks were found.
+#' @return Summary data frame with one row and the integer columns `total`,
+#'   `rendered` and `failed`.
+report_summary <- function(results) {
+  # render_all_notebooks() returns NULL when there was nothing to render
+  if (is.null(results)) {
+    return(data.frame(total = 0L, rendered = 0L, failed = 0L))
+  }
+
+  # Both counts are derived from the same per-file flag so that
+  # rendered + failed always equals total.
+  succeeded <- results$status == "success"
+
+  data.frame(
+    total = nrow(results),
+    rendered = sum(succeeded),
+    failed = sum(!succeeded)
+  )
+}
+
+# Print a short usage hint when the file is run directly (not via source())
+if (sys.nframe() == 0) {
+  cat(
+    "=== Automated Report Renderer ===\n",
+    "Usage:\n",
+    "  source('automated_reporting/render_reports.R')\n",
+    "  render_all_notebooks('../notebooks/', output_format = 'html')\n",
+    "  render_notebook('path/to/notebook.qmd', 'pdf')\n",
+    sep = ""
+  )
+}
diff --git a/survey_tools/README.md b/survey_tools/README.md
@@ -1,3 +1,24 @@
 # Survey Tools
 
-Survey-weighted summaries and wrappers using `srvyr`, `survey`, and related packages.
+R functions for survey design and analysis using the `survey` and `srvyr` packages.
+
+## Contents
+
+| Script | Purpose |
+|--------|---------|
+| `sample_size_calculator.R` | Calculate sample sizes for simple, stratified, and cluster designs |
+| `sampling_weights.R` | Compute and apply survey weights for complex designs |
+| `survey_summary.R` | Weighted descriptive statistics with confidence intervals |
+
+## Usage
+
+```r
+source("survey_tools/sample_size_calculator.R")
+sample_size_simple(p = 0.5, margin = 0.05, confidence = 0.95)
+```
+
+## Requirements
+
+- R 4.0+
+- `tidyverse`
+- `survey` (for `survey_summary.R`)
diff --git a/survey_tools/sample_size_calculator.R b/survey_tools/sample_size_calculator.R
@@ -0,0 +1,106 @@
+# Sample Size Calculator for Survey Design
+# Covers simple random, stratified, and cluster sampling designs
+
+#' Calculate sample size for simple random sampling
+#'
+#' Computes n0 = z^2 * p * (1 - p) / margin^2, where z is the two-sided
+#' normal quantile for `confidence`. When `population` is supplied the finite
+#' population correction n0 / (1 + (n0 - 1) / population) is applied. The
+#' result is rounded up to a whole number of units.
+#'
+#' @param p Expected proportion (default 0.5 for maximum variance); must lie
+#'   in [0, 1]
+#' @param margin Margin of error (default 0.05); must be positive
+#' @param confidence Confidence level (default 0.95); must lie strictly
+#'   between 0 and 1
+#' @param population Population size (NULL for infinite); must be positive
+#'   when supplied
+#' @return Required sample size
+sample_size_simple <- function(p = 0.5, margin = 0.05, confidence = 0.95, population = NULL) {
+  # Reject inputs that would otherwise silently produce Inf, NaN or negative
+  # sizes. NA values are let through so they still propagate to the result.
+  stopifnot(
+    is.numeric(p), all(p >= 0 & p <= 1, na.rm = TRUE),
+    is.numeric(margin), all(margin > 0, na.rm = TRUE),
+    is.numeric(confidence), all(confidence > 0 & confidence < 1, na.rm = TRUE),
+    is.null(population) ||
+      (is.numeric(population) && all(population > 0, na.rm = TRUE))
+  )
+
+  z <- qnorm(1 - (1 - confidence) / 2)
+  n <- (z^2 * p * (1 - p)) / margin^2
+
+  # Finite population correction
+  if (!is.null(population)) {
+    n <- n / (1 + (n - 1) / population)
+  }
+  ceiling(n)
+}
+
+#' Calculate sample size for stratified sampling
+#'
+#' The total sample size is taken from `sample_size_simple()` for the
+#' combined population using the conservative proportion p = 0.5, then split
+#' across strata. Each stratum's share is rounded up, so the stratum sizes
+#' can add up to slightly more than the total.
+#'
+#' @param strata_sizes Vector of stratum population sizes
+#' @param strata_proportions Expected proportion per stratum (default 0.5 for
+#'   every stratum). Reported in the output only; it does not change the
+#'   computed sample sizes.
+#' @param margin Margin of error
+#' @param confidence Confidence level
+#' @param allocation Allocation method: "proportional" or "equal"
+#' @return Data frame with stratum-wise sample sizes (columns `stratum`,
+#'   `population`, `proportion`, `sample_size`)
+sample_size_stratified <- function(strata_sizes, strata_proportions = NULL,
+                                     margin = 0.05, confidence = 0.95,
+                                     allocation = "proportional") {
+  # Fail on an unknown method instead of silently using equal allocation
+  allocation <- match.arg(allocation, c("proportional", "equal"))
+
+  k <- length(strata_sizes)
+  if (is.null(strata_proportions)) strata_proportions <- rep(0.5, k)
+
+  N <- sum(strata_sizes)
+
+  # Total sample size
+  # NOTE(review): uses p = 0.5 regardless of strata_proportions - confirm
+  # whether the stratum proportions are meant to reduce the total.
+  n_total <- sample_size_simple(p = 0.5, margin = margin, confidence = confidence, population = N)
+
+  # Allocate across strata
+  if (allocation == "proportional") {
+    weights <- strata_sizes / N
+  } else {
+    weights <- rep(1 / k, k)
+  }
+
+  n_strata <- ceiling(n_total * weights)
+
+  data.frame(
+    stratum = seq_len(k),
+    population = strata_sizes,
+    proportion = strata_proportions,
+    sample_size = n_strata
+  )
+}
+
+#' Sample size for a cluster randomized design
+#'
+#' Inflates the simple-random-sampling size by the design effect
+#' 1 + (cluster_size - 1) * icc and converts the result into whole clusters.
+#'
+#' @param icc Intra-cluster correlation coefficient
+#' @param cluster_size Average number of units per cluster
+#' @param p Expected proportion
+#' @param margin Margin of error
+#' @param confidence Confidence level
+#' @return List with the rounded design effect, the simple and the
+#'   design-effect-adjusted sample sizes, the number of clusters needed, the
+#'   resulting total sample, and the `icc` / `cluster_size` inputs
+sample_size_cluster <- function(icc = 0.05, cluster_size = 30,
+                                  p = 0.5, margin = 0.05, confidence = 0.95) {
+  # Baseline size ignoring clustering
+  srs_n <- sample_size_simple(p = p, margin = margin, confidence = confidence)
+
+  # Variance inflation caused by within-cluster similarity
+  design_effect <- 1 + icc * (cluster_size - 1)
+
+  # Inflate, then round up to complete clusters
+  inflated_n <- ceiling(srs_n * design_effect)
+  clusters <- ceiling(inflated_n / cluster_size)
+
+  list(
+    design_effect = round(design_effect, 2),
+    simple_sample_size = srs_n,
+    effective_sample_size = inflated_n,
+    clusters_needed = clusters,
+    total_sample = clusters * cluster_size,
+    icc = icc,
+    cluster_size = cluster_size
+  )
+}
+
+# Demonstration output when the file is run directly (not via source())
+if (sys.nframe() == 0) {
+  cat("=== Simple Random Sampling ===\n")
+  cat(
+    sprintf("50%% proportion, 5%% margin: n = %d\n", sample_size_simple()),
+    sprintf("30%% proportion, 3%% margin: n = %d\n", sample_size_simple(p = 0.3, margin = 0.03)),
+    sprintf("With population 10000: n = %d\n", sample_size_simple(population = 10000)),
+    sep = ""
+  )
+
+  cat("\n=== Stratified Sampling ===\n")
+  strat <- sample_size_stratified(strata_sizes = c(5000, 3000, 2000), allocation = "proportional")
+  print(strat)
+
+  cat("\n=== Cluster Sampling ===\n")
+  cluster <- sample_size_cluster(icc = 0.05, cluster_size = 30)
+  cat(
+    sprintf("Design effect: %.2f\n", cluster$design_effect),
+    sprintf("Clusters needed: %d\n", cluster$clusters_needed),
+    sprintf("Total sample: %d (vs %d simple)\n", cluster$total_sample, cluster$simple_sample_size),
+    sep = ""
+  )
+}
diff --git a/survey_tools/sampling_weights.R b/survey_tools/sampling_weights.R
diff --git a/survey_tools/survey_summary.R b/survey_tools/survey_summary.R