Skip to content

Validation Chat with MGA #22

@Zhenglei-BCS

Description

@Zhenglei-BCS

We can generalize my Dunnett framework into a modular validator that supports multiple test types (Williams, Welch/Student t, Wilcoxon, Kruskal–Wallis + Dunn, Fisher, Shapiro/Levene), each with its own discovery, execution, and metric mapping. Below are concise, practical code suggestions and targeted R commands I need you to run so I can tailor the handlers precisely.

What we’ll reuse

  • Discovery: build_dunnett_fgs pattern, generalized to build_generic_fgs(pattern).
  • Helpers: convert_dose, convert_numeric, dose_from_comparison, normalize_alternative.
  • Long-format builder and display filters (Actual/Expected/Status with metadata).

Registry design (extensible)

  • Register each test type with:

    • keyword/pattern(s) to discover expected rows
    • discover_fgs function (FG/Study pairs + alternatives)
    • run_actual function to compute real results for one endpoint
    • metrics list and tolerances
    • comparator type (many_to_one, two_sample, trend, per_group)

Code: extend the registry with stubs you can fill in

# Existing 'dunnett' entry stays the same
test_registry <- list(
  dunnett = list(
    keyword = "Dunnett",
    discover_fgs = function(res, data) build_dunnett_fgs(res, data),
    run_actual = function(endpoint_data, alternative) {
      out <- drcHelper::dunnett_test(
        data = endpoint_data,
        response_var = "Response",
        dose_var = "Dose_numeric",
        include_random_effect = FALSE,
        alternative = alternative
      )
      actual_df <- as.data.frame(out$results_table)
      actual_df$Dose <- dose_from_comparison(as.character(actual_df$comparison))
      names(actual_df)[names(actual_df) == "statistic"] <- "Actual_T"
      names(actual_df)[names(actual_df) == "p.value"]  <- "Actual_P"
      names(actual_df)[names(actual_df) == "estimate"] <- "Actual_Diff"
      rownames(actual_df) <- NULL

      group_means <- endpoint_data %>%
        dplyr::mutate(Dose = convert_dose(Dose)) %>%
        dplyr::filter(!is.na(Dose)) %>%
        dplyr::group_by(Dose) %>%
        dplyr::summarise(Actual_Mean = mean(Response, na.rm = TRUE), .groups = "drop")

      list(actual_df = actual_df, group_means = group_means)
    },
    metrics = c("Mean", "T-value", "P-value"),
    comparator = "many_to_one"
  ),

  # Williams trend test (uses tukeytrend; drcHelper has broom_williams/getwilliamRes)
  williams = list(
    keyword = "Williams",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Williams"),
    run_actual = function(endpoint_data, alternative) {
      # Placeholder: will wire once we see the output shapes
      # Likely via drcHelper::broom_williams or drcHelper::getwilliamRes
      stop("Williams handler: please provide structure of broom_williams/getwilliamRes outputs (see commands below).")
    },
    metrics = c("Mean", "T-value", "P-value", "Tcrit", "df", "significance"),
    comparator = "trend"
  ),

  # Student t-test (var.equal = TRUE) many-to-one (dose vs control)
  student_t = list(
    keyword = "Student's t-test",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Student's t-test"),
    run_actual = function(endpoint_data, alternative) {
      # Compute per-dose vs control t-tests (unadjusted)
      ed <- endpoint_data %>% dplyr::mutate(Dose = convert_dose(Dose))
      ctrl <- ed %>% dplyr::filter(Dose == min(Dose, na.rm = TRUE)) %>% dplyr::pull(Response)
      by_dose <- ed %>% dplyr::filter(Dose != min(Dose, na.rm = TRUE)) %>% split(.$Dose)

      rows <- lapply(names(by_dose), function(d) {
        trt <- by_dose[[d]]$Response
        tt <- stats::t.test(trt, ctrl, alternative = switch(tolower(alternative),
                                                           "smaller" = "less", "greater" = "greater", "two-sided" = "two.sided"),
                            var.equal = TRUE)
        data.frame(Dose = as.numeric(d), Actual_T = unname(tt$statistic), Actual_P = tt$p.value)
      })
      actual_df <- dplyr::bind_rows(rows)
      group_means <- ed %>% dplyr::group_by(Dose) %>% dplyr::summarise(Actual_Mean = mean(Response, na.rm = TRUE), .groups = "drop")
      list(actual_df = actual_df, group_means = group_means)
    },
    metrics = c("Mean", "T-value", "P-value", "df"),
    comparator = "many_to_one"
  ),

  # Welch t-test (var.equal = FALSE)
  welch = list(
    keyword = "Welch",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Welch"),
    run_actual = function(endpoint_data, alternative) {
      ed <- endpoint_data %>% dplyr::mutate(Dose = convert_dose(Dose))
      ctrl <- ed %>% dplyr::filter(Dose == min(Dose, na.rm = TRUE)) %>% dplyr::pull(Response)
      by_dose <- ed %>% dplyr::filter(Dose != min(Dose, na.rm = TRUE)) %>% split(.$Dose)
      rows <- lapply(names(by_dose), function(d) {
        trt <- by_dose[[d]]$Response
        tt <- stats::t.test(trt, ctrl, alternative = switch(tolower(alternative),
                                                           "smaller" = "less", "greater" = "greater", "two-sided" = "two.sided"),
                            var.equal = FALSE)
        data.frame(Dose = as.numeric(d), Actual_T = unname(tt$statistic), Actual_P = tt$p.value)
      })
      actual_df <- dplyr::bind_rows(rows)
      group_means <- ed %>% dplyr::group_by(Dose) %>% dplyr::summarise(Actual_Mean = mean(Response, na.rm = TRUE), .groups = "drop")
      list(actual_df = actual_df, group_means = group_means)
    },
    metrics = c("Mean", "T-value", "P-value", "df"),
    comparator = "many_to_one"
  ),

  # Wilcoxon (Mann–Whitney) per dose vs control
  wilcoxon = list(
    keyword = "Wilcoxon test",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Wilcoxon test"),
    run_actual = function(endpoint_data, alternative) {
      ed <- endpoint_data %>% dplyr::mutate(Dose = convert_dose(Dose))
      ctrl <- ed %>% dplyr::filter(Dose == min(Dose, na.rm = TRUE)) %>% dplyr::pull(Response)
      by_dose <- ed %>% dplyr::filter(Dose != min(Dose, na.rm = TRUE)) %>% split(.$Dose)
      rows <- lapply(names(by_dose), function(d) {
        trt <- by_dose[[d]]$Response
        wt <- stats::wilcox.test(trt, ctrl, alternative = switch(tolower(alternative),
                                                                 "smaller" = "less", "greater" = "greater", "two-sided" = "two.sided"))
        data.frame(Dose = as.numeric(d), Actual_W = unname(wt$statistic), Actual_P = wt$p.value)
      })
      actual_df <- dplyr::bind_rows(rows)  # Note: metric names differ (W-value, P-value)
      group_means <- ed %>% dplyr::group_by(Dose) %>% dplyr::summarise(Actual_Mean = mean(Response, na.rm = TRUE), .groups = "drop")
      list(actual_df = actual_df, group_means = group_means)
    },
    metrics = c("Mean", "W-value", "P-value"),
    comparator = "many_to_one"
  ),

  # Kruskal–Wallis + Dunn’s post-hoc
  dunn = list(
    keyword = "Dunn's test",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Dunn's test"),
    run_actual = function(endpoint_data, alternative) {
      ed <- endpoint_data %>% dplyr::mutate(Dose = convert_dose(Dose))
      ed <- ed %>% dplyr::filter(!is.na(Dose))
      # KW omnibus
      kw <- stats::kruskal.test(Response ~ as.factor(Dose), data = ed)
      # Dunn post-hoc many-to-one vs control
      if (!requireNamespace("rstatix", quietly = TRUE)) stop("Please install rstatix for Dunn's test")
      dunn_res <- rstatix::dunn_test(ed, Response ~ as.factor(Dose), p.adjust.method = "bonferroni")  # adjust if needed
      # Extract many-to-one contrasts vs lowest Dose (control)
      ctrl_label <- as.character(min(ed$Dose, na.rm = TRUE))
      dunn_vs_ctrl <- dunn_res %>%
        dplyr::filter(grepl(paste0("^", ctrl_label, " - "), comparison) | grepl(paste0(" - ", ctrl_label, "$"), comparison)) %>%
        dplyr::mutate(Dose = dose_from_comparison(comparison)) %>%
        dplyr::rename(Actual_z = statistic, Actual_P = p.adj)
      actual_df <- dunn_vs_ctrl %>% dplyr::select(Dose, Actual_z, Actual_P)
      group_means <- ed %>% dplyr::group_by(Dose) %>% dplyr::summarise(Actual_Mean = mean(Response, na.rm = TRUE), .groups = "drop")
      list(actual_df = actual_df, group_means = group_means, kw = kw)
    },
    metrics = c("Mean", "z-value", "P-value"),  # plus optionally H-statistic for KW
    comparator = "many_to_one"
  ),

  # Fisher’s exact (binary endpoints; drcHelper provides many_to_one_fisher_test)
  fisher = list(
    keyword = "Fishers' test",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Fishers' test"),
    run_actual = function(endpoint_data, alternative) {
      # Structure depends on Alive/Dead counts; we’ll wire after seeing a concrete example
      stop("Fisher handler: please provide example endpoint_data (Alive/Dead/Total) and expected rows.")
    },
    metrics = c("P-value", "significance"),
    comparator = "many_to_one"
  ),

  # Model tests (Shapiro/Levene) on residuals
  shapiro = list(
    keyword = "Shapiro-Wilk",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Shapiro-Wilk"),
    run_actual = function(endpoint_data, alternative) {
      # Are these computed on residuals from Response ~ Dose? Please confirm
      stop("Shapiro handler: confirm target vector (residuals vs raw group).")
    },
    metrics = c("W-value", "P-value"),
    comparator = "per_group"
  ),
  levene = list(
    keyword = "Levene",
    discover_fgs = function(res, data) build_generic_fgs(res, data, pattern = "Levene"),
    run_actual = function(endpoint_data, alternative) {
      stop("Levene handler: confirm target (raw groups vs residuals).")
    },
    metrics = c("F-value", "P-value"),
    comparator = "per_group"
  )
)

Generic runner

  • You can use the run_consolidated_validation_generic function I provided earlier (it already handles Mean/T/P joins and long-format output).
  • For tests with different metric names, adjust the “metric splits” in that function based on reg$metrics and on Brief description labels:
    • Wilcoxon: “W-Value”
    • Dunn: “z-value”
    • Kruskal–Wallis: “H-statistic”
    • Significance: compute Actual_Significance as (Actual_P < alpha), if expected result value is 0/1; please confirm expected encoding.

Dose embedded in Brief description

  • Some rows encode the dose inside Brief description, e.g., “Dunn's test, smaller, 0,41, z-value”.
  • Add a parser that falls back to description when Dose is “n/a”:
dose_from_description <- function(desc) {
  # capture first numeric token allowing comma decimal
  m <- regmatches(desc, regexpr("[0-9]+(?:,[0-9]+)?", desc))
  ifelse(length(m) > 0, convert_dose(m), NA_real_)
}

When preparing expected tables, set Dose = coalesce(convert_dose(Dose), dose_from_description(Brief description)).

Key formulas you’ll likely need

  • Percent inhibition/reduction per dose relative to control (please confirm your definition):
    $$%Inhibition = 100 * (1 - mean_treatment / mean_control)$$
    $$%Reduction = 100 * (1 - mean_treatment / mean_control)$$
    If your expected results use a different baseline or direction, tell me and I’ll adapt.

What I need you to run so I can finalize handlers and metric mapping
Please run these on one Williams FG/study and paste the outputs:

  1. Show structure of Williams outputs from drcHelper:
  • Pick an endpoint_data for a Williams case (FG00120 looks promising). Then:
w_ep <- test_cases_data %>% dplyr::filter(`Study ID` == "MOCK0065")  # replace with the actual Williams study
w_ep$Dose_numeric <- convert_dose(w_ep$Dose)
w_out <- try(drcHelper::williamsTest_JG(data = w_ep, response_var = "Response", dose_var = "Dose_numeric"), silent = TRUE)
str(w_out)

bw_out <- try(drcHelper::broom_williams(data = w_ep, response_var = "Response", dose_var = "Dose_numeric"), silent = TRUE)
str(bw_out)
  1. Provide a small slice of expected Williams rows (10–20 rows):
test_cases_res %>%
  dplyr::filter(grepl("Williams", `Brief description`, ignore.case = TRUE)) %>%
  dplyr::select(`Function group ID`, `Study ID`, `Endpoint`, `Brief description`, Dose, `expected result value`) %>%
  dplyr::slice(1:20) %>% print(n = 20)

  1. For Dunn’s test (Kruskal–Wallis post-hoc), show a slice of expected rows:
test_cases_res %>%
  dplyr::filter(grepl("Dunn's test", `Brief description`, ignore.case = TRUE)) %>%
  dplyr::select(`Function group ID`, `Study ID`, `Endpoint`, `Brief description`, Dose, `expected result value`) %>%
  dplyr::slice(1:20) %>% print(n = 20)

  1. For Wilcoxon, show expected rows and a sample endpoint_data:
  • Expected:
test_cases_res %>%
  dplyr::filter(grepl("Wilcoxon test", `Brief description`, ignore.case = TRUE)) %>%
  dplyr::select(`Function group ID`, `Study ID`, `Endpoint`, `Brief description`, Dose, `expected result value`) %>%
  dplyr::slice(1:20) %>% print(n = 20)

  • Data groups:
test_cases_data %>%
  dplyr::filter(`Study ID` %in% unique(test_cases_res$`Study ID`[grepl("Wilcoxon test", test_cases_res$`Brief description`, ignore.case = TRUE)])) %>%
  dplyr::count(`Study ID`, `Test group`) %>% print(n = 50)

  1. For Fisher’s test, show endpoint_data columns (Alive/Dead/Total?) and expected rows:
  • Data:
test_cases_data %>%
  dplyr::filter(!is.na(Alive) | !is.na(Dead) | !is.na(Total)) %>%
  dplyr::slice(1:20) %>% print(n = 20)
  • Expected:
test_cases_res %>%
  dplyr::filter(grepl("Fishers' test", `Brief description`, ignore.case = TRUE)) %>%
  dplyr::select(`Function group ID`, `Study ID`, `Endpoint`, `Brief description`, Dose, `expected result value`) %>%
  dplyr::slice(1:20) %>% print(n = 20)
  1. For Shapiro/Levene “model test”: confirm target vector:
  • Are these computed on residuals from lm(Response ~ Dose_numeric)? If yes, we’ll fit the same lm, then use stats::shapiro.test on residuals and car::leveneTest on grouped raw data, or another agreed approach. Please confirm.
  • No, residuals are from lm(Response ~ Dose_factor)

Once I see these structures, I’ll wire the run_actual handlers to return the same shape as your Dunnett handler (actual_df + group_means), extend the expected metric parser for labels like “Tcrit”, “df”, “z-value”, “W-value”, “%Inhibition/%Reduction”, and finalize tolerances by metric/test.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions