Skip to content

[Bug]: Dropping variables when merging datasets with merge_expression_module #262

@llrs-roche

Description

@llrs-roche

What happened?

When extracting and merging from more than two data sources with merge_expression_module it seems like the variables selected from the third that are not part of the primary key are dropped even if they are required/selected by the app:

Details

library(shiny)
library(teal.data)
library(teal.widgets)

ADSL <- data.frame(
  STUDYID = "A",
  USUBJID = LETTERS[1:10],
  SEX = rep(c("F", "M"), 5),
  AGE = rpois(10, 30),
  BMRKR1 = rlnorm(10)
)
ADLB <- expand.grid(
  STUDYID = "A",
  USUBJID = LETTERS[1:10],
  PARAMCD = c("ALT", "CRP", "IGA"),
  AVISIT = c("SCREENING", "BASELINE", "WEEK 1 DAY 8", "WEEK 2 DAY 15")
)
ADLB$AVAL <- rlnorm(120)
ADLB$CHG <- rnorm(120)

data_list <- list(
  ADSL = reactive(ADSL),
  ADLB = reactive(ADLB),
  ADAE = reactive(teal.data::rADAE)
)

join_keys <- join_keys(
  join_key("ADSL", "ADSL", c("STUDYID", "USUBJID")),
  join_key("ADSL", "ADLB", c("STUDYID", "USUBJID")),
  join_key("ADSL", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADLB", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADAE", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADLB", "ADLB", c("STUDYID", "USUBJID", "PARAMCD", "AVISIT"))
)

adsl_extract <- data_extract_spec(
  dataname = "ADSL",
  select = select_spec(
    label = "Select variable:",
    choices = c("AGE", "BMRKR1"),
    selected = "AGE",
    multiple = TRUE,
    fixed = FALSE
  )
)

adlb_extract <- data_extract_spec(
  dataname = "ADLB",
  filter = filter_spec(vars = "PARAMCD", choices = c("ALT", "CRP", "IGA"), selected = "ALT"),
  select = select_spec(
    label = "Select variable:",
    choices = c("AVAL", "CHG"),
    selected = "AVAL",
    multiple = TRUE,
    fixed = FALSE
  )
)
adae_extract <- data_extract_spec(
  dataname = "ADAE",
  select = select_spec(
    label = "Select variable:",
    choices = c("RACE", "ETHNIC"),
    selected = "RACE",
    multiple = TRUE,
    fixed = FALSE
  )
)

ui <- bslib::page_fluid(
  bslib::layout_sidebar(
    tags$div(
      verbatimTextOutput("expr"),
      dataTableOutput("data")
    ),
    sidebar = tagList(
      data_extract_ui("adsl_var", label = "ADSL selection", adsl_extract),
      data_extract_ui("adlb_var", label = "ADLB selection", adlb_extract),
      data_extract_ui("adae_var", label = "ADAE selection", adae_extract)
    )
  )
)

server <- function(input, output, session) {
  data_q <- qenv()
  
  data_q <- eval_code(
    data_q,
    "ADSL <- data.frame(
        STUDYID = 'A',
        USUBJID = LETTERS[1:10],
        SEX = rep(c('F', 'M'), 5),
        AGE = rpois(10, 30),
        BMRKR1 = rlnorm(10)
      )"
  )
  
  data_q <- eval_code(
    data_q,
    "ADLB <- expand.grid(
        STUDYID = 'A',
        USUBJID = LETTERS[1:10],
        PARAMCD = c('ALT', 'CRP', 'IGA'),
        AVISIT = c('SCREENING', 'BASELINE', 'WEEK 1 DAY 8', 'WEEK 2 DAY 15'),
        AVAL = rlnorm(120),
        CHG = rlnorm(120)
       )"
  )
  
  data_q <- within(data_q, {
    ADAE <- teal.data::rADAE
  })
  
  merged_data <- merge_expression_module(
    data_extract = list(adsl_var = adsl_extract, 
                        adlb_var = adlb_extract, 
                        adae_bar = adae_extract),
    datasets = data_list,
    join_keys = join_keys,
    merge_function = "dplyr::left_join"
  )
  
  code_merge <- reactive({
    for (exp in merged_data()$expr) data_q <- eval_code(data_q, exp)
    data_q
  })
  
  output$expr <- renderText(paste(merged_data()$expr, collapse = "\n"))
  output$data <- renderDataTable(code_merge()[["ANL"]])
}

if (interactive()) {
  shinyApp(ui, server)
}

Code/expression generated by the function:

ANL_1 <- ADSL %>% dplyr::select(STUDYID, USUBJID, AGE)
ANL_2 <- ADLB %>% dplyr::filter(PARAMCD == "ALT") %>% dplyr::select(STUDYID, USUBJID, AVAL)
ANL_3 <- ADAE %>% dplyr::select(STUDYID, USUBJID) # RACE is selected by the APP
ANL <- ANL_1
ANL <- dplyr::left_join(ANL, ANL_2, by = c("STUDYID", "USUBJID"))
ANL <- dplyr::left_join(ANL, ANL_3, by = c("STUDYID", "USUBJID"))

Expected code generated by the app:

ANL_1 <- ADSL %>% dplyr::select(STUDYID, USUBJID, AGE)
ANL_2 <- ADLB %>% dplyr::filter(PARAMCD == "ALT") %>% dplyr::select(STUDYID, USUBJID, AVAL)
ANL_3 <- ADAE # Or %>% dplyr::select(STUDYID, USUBJID, RACE)
ANL <- ANL_1
ANL <- dplyr::left_join(ANL, ANL_2, by = c("STUDYID", "USUBJID"))
ANL <- dplyr::left_join(ANL, ANL_3, by = c("STUDYID", "USUBJID"))

While it is labeled as experimental I saw it first mentioned in teal.transform 0.2.0 (which I don't know how long ago it was) but it is clearly used across the framework for example in TMC modules (where I found it while developing a new module)

sessionInfo()

Relevant log output

Code of Conduct

  • I agree to follow this project's Code of Conduct.

Contribution Guidelines

  • I agree to follow this project's Contribution Guidelines.

Security Policy

  • I agree to follow this project's Security Policy.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions