From 83dd6d04facb0b5c6c6571428aab8fa5cf5f885e Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 5 May 2026 12:30:57 +0200 Subject: [PATCH] Remove size unit variable and related logic from dataset metadata handling --- .../content_metadata_dataset_builder.py | 2 - .../contents_define_dataset_builder.py | 2 - .../dataset_metadata_values_builder.py | 2 - .../data_services/base_data_service.py | 4 +- .../data_services/local_data_service.py | 13 +--- .../utilities/rule_processor.py | 10 --- cdisc_rules_engine/utilities/utils.py | 13 ---- .../test_values_dataset_metadata_builder.py | 2 - .../test_utilities/test_rule_processor.py | 77 ------------------- 9 files changed, 3 insertions(+), 122 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py index 9fa191320..58b8178fe 100644 --- a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py @@ -12,8 +12,6 @@ def build(self): is_ap - Whether the domain is an AP domain ap_suffix - The 2-character suffix from AP domains """ - size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) return self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, - size_unit=size_unit, ) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index 9c21c5e72..e801a0e19 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -34,10 +34,8 @@ def build(self): dataset_name=self.dataset_metadata.name ) # Build dataset metadata dataframe - size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, - size_unit=size_unit, ).to_dict(orient="records")[0] # Build define xml dataframe define = self.get_define_xml_item_group_metadata_for_dataset(dataset_metadata) diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py index eb8206f78..4355f28a8 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py @@ -19,10 +19,8 @@ def build(self): - is_ap - Whether the domain is an AP domain - ap_suffix - The 2-character suffix from AP domains """ - size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, - size_unit=size_unit, ) dataset_metadata = dataset_metadata.to_dict(orient="records")[0] data_contents_long_df = super().build() diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 9073e63d8..25a22f517 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -203,9 +203,7 @@ def get_data_structure( return OTHER @cached_dataset(DatasetTypes.METADATA.value) - def get_dataset_metadata( - self, dataset_name: str, size_unit: str = None, **params - ) -> DatasetInterface: + def get_dataset_metadata(self, dataset_name: str, **params) -> DatasetInterface: """ Gets metadata of a dataset and returns it as a DataFrame. """ diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 77eb5119a..8e9d2a1c2 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -1,7 +1,7 @@ import os from os.path import basename from io import IOBase -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Tuple from cdisc_rules_engine.interfaces import CacheServiceInterface, ConfigInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @@ -22,9 +22,6 @@ DatasetNDJSONMetadataReader, ) from cdisc_rules_engine.services.csv_metadata_reader import DatasetCSVMetadataReader -from cdisc_rules_engine.utilities.utils import ( - convert_file_size, -) from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat from .base_data_service import BaseDataService, cached_dataset from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes @@ -249,16 +246,10 @@ def read_data(self, file_path: str) -> IOBase: def __get_dataset_metadata(self, dataset_path: str, **kwargs) -> Tuple[dict, dict]: """ - Internal method that gets dataset metadata - and converts file size if needed. + Internal method that gets dataset metadata. """ metadata: dict = self.__read_metadata(dataset_path) file_metadata: dict = metadata["file_metadata"] - size_unit: Optional[str] = kwargs.get("size_unit") - if size_unit: # convert file size from bytes to desired unit if needed - file_metadata["file_size"] = convert_file_size( - file_metadata["file_size"], size_unit - ) return file_metadata, metadata["contents_metadata"] def to_parquet(self, file_path: str) -> str: diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 1395947b2..dba483fce 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -519,16 +519,6 @@ def is_relationship_dataset(self, dataset_name: str) -> bool: ) return result - def get_size_unit_from_rule(self, rule: dict) -> Optional[str]: - """ - Extracts size unit from rule if it was passed - """ - rule_conditions: ConditionInterface = rule["conditions"] - for condition in rule_conditions.values(): - value: dict = condition["value"] - if value["target"] == "dataset_size": - return value.get("unit") - def add_operator_to_rule_conditions( self, rule: dict, target_to_operator_map: dict, domain: str ): diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 8ee1c1e75..7ab88b900 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -38,19 +38,6 @@ def convert_dataclass_to_superclass[T](instance: object, superclass: type[T]) -> ) -def convert_file_size(size_in_bytes: int, desired_unit: str) -> float: - """ - Converts file size from bytes to any of the following units: - KB, MB, GB - """ - unit_to_denominator_map: dict = { - "KB": 1024, - "MB": 1024**2, - "GB": 1024**3, - } - return size_in_bytes / unit_to_denominator_map[desired_unit] - - def get_execution_status(results): """ If any result has an execution error, return execution error diff --git a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py index 563563f63..4c258f9d4 100644 --- a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py @@ -52,7 +52,6 @@ def test_build_with_dataset_metadata(mock_build): rule_mock = MagicMock() rule_processor_mock = MagicMock() - rule_processor_mock.get_size_unit_from_rule.return_value = "KB" try: builder = ValueCheckDatasetMetadataDatasetBuilder( rule=rule_mock, @@ -133,7 +132,6 @@ def test_build_split_datasets(mock_build): rule_mock = MagicMock() rule_processor_mock = MagicMock() - rule_processor_mock.get_size_unit_from_rule.return_value = "KB" try: builder = ValueCheckDatasetMetadataDatasetBuilder( rule=rule_mock, diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index e0f0ed23a..33b14688d 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -3,7 +3,6 @@ import pandas as pd import pytest -from conftest import mock_data_service from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.rule_conditions import ConditionCompositeFactory @@ -1023,82 +1022,6 @@ def test_extract_target_names_from_rule_output_variables(): ] -@pytest.mark.parametrize( - "conditions", - [ - { - "any": [ - { - "value": { - "target": "dataset_label", - "comparator": "Adverse Events", - }, - "operator": "equal_to", - }, - { - "value": {"target": "dataset_size", "unit": "MB", "comparator": 5}, - "operator": "less_than", - }, - ] - }, - { - "any": [ - { - "value": { - "target": "dataset_label", - "comparator": "Adverse Events", - }, - "operator": "equal_to", - }, - { - "all": [ - { - "value": { - "target": "dataset_size", - "unit": "MB", - "comparator": 5, - }, - "operator": "less_than", - }, - ] - }, - ] - }, - { - "not": { - "any": [ - { - "value": { - "target": "dataset_label", - "comparator": "Adverse Events", - }, - "operator": "equal_to", - }, - { - "all": [ - { - "value": { - "target": "dataset_size", - "unit": "MB", - "comparator": 5, - }, - "operator": "less_than", - }, - ] - }, - ] - } - }, - ], -) -def test_get_size_unit_from_rule(conditions: dict): - rule: dict = { - "conditions": ConditionCompositeFactory.get_condition_composite(conditions), - } - processor = RuleProcessor(mock_data_service, InMemoryCacheService()) - assert processor.get_size_unit_from_rule(rule) == "MB" - - def test_duplicate_for_targets(): """ Unit test for ConditionComposite.add_variable_condtions method.