Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions cdisc_rules_engine/utilities/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,13 @@ def merge_pivot_supp_dataset(
left_dataset: DatasetInterface,
right_dataset: DatasetInterface,
):

static_keys = ["STUDYID", "USUBJID", "APID", "POOLID", "SPDEVID"]
qnam_list = right_dataset["QNAM"].unique()
unique_idvar_values = right_dataset["IDVAR"].unique()
if len(unique_idvar_values) == 1:
right_dataset = DataProcessor.process_supp(right_dataset)
dynamic_key = right_dataset["IDVAR"].iloc[0]
is_blank = pd.isna(dynamic_key) or str(dynamic_key).strip() == ""
is_blank: bool = pd.isna(dynamic_key) or str(dynamic_key).strip() == ""
# Determine the common keys present in both datasets
common_keys = [
key
Expand Down Expand Up @@ -352,6 +351,14 @@ def process_supp(supp_dataset):
columns_to_drop = [
col for col in ["QNAM", "QVAL", "QLABEL"] if col in supp_dataset.columns
]
if "RDOMAIN" in supp_dataset.columns and supp_dataset["RDOMAIN"][0] == "DM":
excluded_columns = list(supp_dataset["QNAM"].unique()) + columns_to_drop
group_cols = [c for c in supp_dataset.columns if c not in excluded_columns]
supp_dataset = PandasDataset(
supp_dataset.data.groupby(group_cols, dropna=False, as_index=False).agg(
lambda x: (x.dropna().iloc[0] if not x.dropna().empty else pd.NA)
)
)
if columns_to_drop:
supp_dataset = supp_dataset.drop(labels=columns_to_drop, axis=1)
return supp_dataset
Expand Down
10 changes: 8 additions & 2 deletions cdisc_rules_engine/utilities/dataset_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ def preprocess( # noqa
else:
if self._is_split_domain(domain_name):
continue
target_domain_name: str = (
self._dataset_metadata.domain or self._dataset_metadata.name
)
file_infos: list[SDTMDatasetMetadata] = [
item
for item in datasets
Expand All @@ -104,12 +107,15 @@ def preprocess( # noqa
or (
domain_name == "SUPP--"
and (not self._dataset_metadata.is_supp)
and item.rdomain == self._dataset_metadata.domain
and item.rdomain == target_domain_name
)
)
]

if not file_infos:
if not file_infos and not (
(self._dataset_metadata.is_supp and domain_name == "SUPP--")
or self._dataset_metadata.name == "RELREC"
):
raise PreprocessingError(
f"Failed to find related dataset for '{domain_name}' in preprocessor"
)
Expand Down
169 changes: 169 additions & 0 deletions tests/QARegressionTests/test_Issues/test_CoreIssue1345.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import os
import subprocess

import pytest
import json
from conftest import get_python_executable


@pytest.fixture
def generate_report():
    """Run a CORE validation over the CoreIssue1345 resources.

    Executes rule CDISC.SDTMIG.CG0019 via the CLI, locates the JSON report
    it produces, and yields ``(report_path, parsed_report)``.

    The report file is removed during fixture teardown, so it is cleaned up
    even when a consuming test fails an assertion — otherwise stale
    ``CORE-Report-*.json`` files would linger and ``sorted(...)[-1]`` could
    pick one of them up as the "latest" report on a later run.
    """
    command = [
        get_python_executable(),
        "-m",
        "core",
        "validate",
        "-s",
        "sdtmig",
        "-v",
        "3-4",
        "-dv",
        "2-1",
        "-dxp",
        os.path.join(
            "tests",
            "resources",
            "CoreIssue1345",
            "define_msg20_testsupp_core.xml",
        ),
        "-d",
        os.path.join(
            "tests",
            "resources",
            "CoreIssue1345",
        ),
        "-lr",
        os.path.join(
            "tests",
            "resources",
            "CoreIssue1345",
        ),
        "-r",
        "CDISC.SDTMIG.CG0019",
        "-l",
        "error",
        "-ps",
        "1",
        "-of",
        "json",
    ]
    subprocess.run(command, check=True)
    # Report filenames embed a sortable timestamp, so the lexicographically
    # greatest name is the most recently generated report.
    json_files = [
        file
        for file in os.listdir()
        if file.startswith("CORE-Report-") and file.endswith(".json")
    ]
    json_report_path = sorted(json_files)[-1]
    # Use a context manager so the file handle is closed deterministically
    # (the original relied on garbage collection to close it).
    with open(json_report_path) as report_file:
        json_report = json.load(report_file)
    yield json_report_path, json_report
    # Teardown: remove the report even if the consuming test failed before
    # reaching its own cleanup code.
    if os.path.exists(json_report_path):
        os.remove(json_report_path)


@pytest.mark.regression
class TestCoreIssue1345:
    """Regression tests for CORE issue 1345: SUPP-- merging and RELREC handling.

    Both tests consume the ``generate_report`` fixture, which runs rule
    CDISC.SDTMIG.CG0019 against the CoreIssue1345 resources and provides the
    path to, and parsed contents of, the resulting JSON report.
    """

    @staticmethod
    def _dataset_filenames(json_report):
        # Upper-cased dataset filenames listed in the report's Dataset_Details.
        return {
            detail["filename"].upper()
            for detail in json_report.get("Dataset_Details", [])
        }

    @staticmethod
    def _issue_details_for(json_report, dataset_names):
        # Issue_Details entries whose dataset (lower-cased) is in dataset_names.
        return [
            issue
            for issue in json_report.get("Issue_Details", [])
            if issue.get("dataset", "").lower() in dataset_names
        ]

    @staticmethod
    def _issue_summary_for(json_report, dataset_names):
        # Issue_Summary entries whose dataset (lower-cased) is in dataset_names.
        return [
            entry
            for entry in json_report.get("Issue_Summary", [])
            if entry.get("dataset", "").lower() in dataset_names
        ]

    @staticmethod
    def _remove_report(json_report_path):
        # Delete the generated report so later runs do not pick up a stale file.
        if os.path.exists(json_report_path):
            os.remove(json_report_path)

    def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues(
        self, generate_report
    ):
        json_report_path, json_report = generate_report
        # try/finally: the original removed the report only after every assert
        # passed, leaking CORE-Report-*.json files whenever the test failed.
        try:
            dataset_filenames = self._dataset_filenames(json_report)

            assert (
                "DM" in dataset_filenames
            ), "DM dataset is missing from Dataset_Details"
            assert (
                "SUPPDM" in dataset_filenames
            ), "SUPPDM dataset is missing from Dataset_Details"

            # DM/SUPPDM must merge cleanly: no issues reported against them.
            dm_related_issues = self._issue_details_for(
                json_report, {"dm.json", "suppdm.json"}
            )
            assert not dm_related_issues, (
                "Found issues related to DM/SUPPDM datasets:\n" f"{dm_related_issues}"
            )

            dm_related_summary = self._issue_summary_for(
                json_report, {"dm.json", "suppdm.json"}
            )
            assert not dm_related_summary, (
                "Found issue summary entries related to DM/SUPPDM:\n"
                f"{dm_related_summary}"
            )

            # EC row-uniqueness violations must still be flagged.
            ec_detail_issues = self._issue_details_for(json_report, {"ec.json"})
            assert (
                ec_detail_issues
            ), "Expected EC-related issues in Issue_Details, but none found"
            assert (
                len(ec_detail_issues) == 2
            ), f"Expected 2 issues for EC dataset, but {len(ec_detail_issues)} found in Issue_Details"

            ec_summary_issues = self._issue_summary_for(json_report, {"ec.json"})
            assert (
                ec_summary_issues
            ), "Expected issues for EC dataset, but none found in Issue_Summary"
        finally:
            self._remove_report(json_report_path)

    def test_engine_correctly_processes_relrec_when_supp_datasets_provided(
        self, generate_report
    ):
        json_report_path, json_report = generate_report
        try:
            dataset_filenames = self._dataset_filenames(json_report)

            assert (
                "DM" in dataset_filenames
            ), "DM dataset is missing from Dataset_Details"
            assert (
                "SUPPDM" in dataset_filenames
            ), "SUPPDM dataset is missing from Dataset_Details"
            assert (
                "EC" in dataset_filenames
            ), "EC dataset is missing from Dataset_Details"
            assert (
                "SUPPEC" in dataset_filenames
            ), "SUPPEC dataset is missing from Dataset_Details"

            # RELREC itself must be processed and checked by the rule.
            assert (
                "RELREC" in dataset_filenames
            ), "RELREC dataset is missing from Dataset_Details"
            relrec_issues = self._issue_details_for(json_report, {"relrec.json"})
            assert (
                len(relrec_issues) == 2
            ), f"Expected 2 issues for RELREC dataset, but {len(relrec_issues)} found"

            # EC must still be processed and contain its issues.
            ec_detail_issues = self._issue_details_for(json_report, {"ec.json"})
            assert (
                len(ec_detail_issues) == 2
            ), f"Expected 2 issues for EC dataset, but {len(ec_detail_issues)} found in Issue_Details"
        finally:
            self._remove_report(json_report_path)
104 changes: 104 additions & 0 deletions tests/resources/CoreIssue1345/CG0019.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Variable: GEN
# Condition:
# Rule: Each record is unique per sponsor defined key variables as documented in the define.xml
# Marcelina Hungria: testing with the CLI and a define.xml that includes a key from a supp dataset (ec/suppec),
# plus dm/suppdm and relrec
# NOTE(review): indentation below reconstructed from a flattened paste — the
# three "Version" keys per standard sit at three different nesting levels
# (Rule Identifier / Reference / Standard); confirm against the original file.
Authorities:
  - Organization: CDISC
    Standards:
      - Name: SDTMIG
        References:
          - Citations:
              - Cited Guidance:
                  Note that the key variables shown in this table are examples
                  only. A sponsor's actual key structure may be different.
                Document: IG v3.4
                Section: Table 3.2.1
              - Cited Guidance:
                  Since the purpose of this column is to aid reviewers in
                  understanding the structure of a dataset, sponsors should list
                  all of the natural keys (see definition below) for the
                  dataset. These keys should define uniqueness for records
                  within a dataset, and may define a record sort order. The
                  identified keys for each dataset should be consistent with the
                  description of the dataset structure as described in the
                  Define-XML document.
                Document: IG v3.4
                Section: 3.2.1.1
            Origin: SDTM and SDTMIG Conformance Rules
            Rule Identifier:
              Id: CG0019
              Version: "1"
            # Reference version (distinct from the rule and standard versions).
            Version: "2.0"
        # Standard version this reference set applies to.
        Version: "3.4"
      - Name: SDTMIG
        References:
          - Citations:
              - Cited Guidance:
                  Table 3.2.1[Note that the key variables shown in this table are
                  examples only. A sponsor's actual key structure may be
                  different.]|3.2.1.1[Since the purpose of this column is to aid
                  reviewers in understanding the structure of a dataset,
                  sponsors should list all of the natural keys (see definition
                  below) for the dataset. These keys should define uniqueness
                  for records within a dataset, and may define a record sort
                  order.]
                Document: IG v3.2
                Section: Table 3.2.1|3.2.1.1
            Origin: SDTM and SDTMIG Conformance Rules
            Rule Identifier:
              Id: CG0019
              Version: "1"
            Version: "2.0"
        Version: "3.2"
      - Name: SDTMIG
        References:
          - Citations:
              - Cited Guidance:
                  Table 3.2.1[Note that the key variables shown in this table are
                  examples only. A sponsor's actual key structure may be
                  different.]||3.2.1.1[Since the purpose of this column is to
                  aid reviewers in understanding the structure of a dataset,
                  sponsors should list all of the natural keys (see definition
                  below) for the dataset. These keys should define uniqueness
                  for records within a dataset, and may define a record sort
                  order.]
                Document: IG v3.3
                Section: Table 3.2.1|3.2.1.1
            Origin: SDTM and SDTMIG Conformance Rules
            Rule Identifier:
              Id: CG0019
              Version: "1"
            Version: "2.0"
        Version: "3.3"
# Flag records whose define.xml key-variable set is not unique in the dataset.
Check:
  all:
    - name: define_dataset_key_sequence
      operator: is_not_unique_set
Core:
  Id: CDISC.SDTMIG.CG0019
  Status: Draft
  Version: "1"
Description: Trigger error if records are not unique as per sponsor defined key
  variables as documented in the define.xml
Executability: Fully Executable
Match Datasets:
  - Is Relationship: true
    Keys:
      - USUBJID
    Name: SUPP--
Outcome:
  Message: Records are not unique as per sponsor defined key variables as
    documented in the define.xml
# define_dataset_key_sequence doesn't appear to grab all variables needed. See define xml plus data included
Output Variables:
  - define_dataset_key_sequence
Rule Type: Dataset Contents Check against Define XML
Scope:
  Classes:
    Include:
      - ALL
  Domains:
    Include:
      - ALL
Sensitivity: Record
Loading
Loading