diff --git a/dpsynth/bin/derive_domain.py b/dpsynth/bin/derive_domain.py deleted file mode 100644 index 20e5df6..0000000 --- a/dpsynth/bin/derive_domain.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Derive the domain from a given dataset and write it out to storage.""" - -import os - -from absl import app -from absl import flags -from absl import logging -from dpsynth import domain -from dpsynth.bin import _read_csv_args -import fancyflags as ff -import numpy as np -import pandas as pd - -import pathlib -PathType = pathlib.Path - -_DATASET_PATH = flags.DEFINE_string( - 'dataset_path', - 'adult.csv', - 'Path to the dataset to derive the domain from.', -) - -_OUTPUT_DIR = flags.DEFINE_string( - 'output_dir', - None, - 'Path to the output directory to write the domain to.', -) - -_CSV_READ_ARGS = ff.DEFINE_auto( - 'csv_read_args', - _read_csv_args.ReadCsvArgs, - _read_csv_args.FLAG_HELP, -) - - -_NUMERICAL_SENTINEL_VALUE = flags.DEFINE_integer( - 'numerical_sentinel_value', - None, - 'Sentinel value to use for numerical columns.', -) - - -def _create_numerical_attribute( - df_col: pd.Series, - dtype_str: str, - numerical_sentinel_value: int | None = None, -) -> domain.NumericalAttribute | domain.CategoricalAttribute: - """Creates a numerical attribute from a pandas Series.""" - clip_to_range = True - if numerical_sentinel_value is not None and any( - value == numerical_sentinel_value for value in df_col.values - ): - # Replace the sentinel value with NaN so that it is not used in the range - # computation. - df_col = df_col.replace(numerical_sentinel_value, np.nan) - clip_to_range = False - - max_value = df_col.max() - min_value = df_col.min() - - # If all values are NaN, return a categorical attribute with a single - # sentinel value or None. This typically happens when the column is empty. - if np.isnan(min_value): - if not np.isnan(max_value): - raise ValueError( - 'max_value is not NaN but min_value can be NaN only if all values are' - ' NaN. This is unexpected.' - ) - - return domain.CategoricalAttribute( - possible_values=[ - numerical_sentinel_value if numerical_sentinel_value else None - ] - ) - - # Check that the sentinel value is not in the range [min_value, max_value]. - if ( - numerical_sentinel_value is not None - and numerical_sentinel_value >= min_value - and numerical_sentinel_value <= max_value - ): - raise ValueError( - f'Sentinel value {numerical_sentinel_value} should be outside the range' - f' [{min_value}, {max_value}]' - ) - - # If the min and max values are the same, return a categorical attribute with - # a single value. - if min_value == max_value: - return domain.CategoricalAttribute(possible_values=[min_value]) - - return domain.NumericalAttribute( - min_value=min_value, - max_value=max_value, - clip_to_range=clip_to_range, - dtype=dtype_str, - ) - - -def derive_domain_from_data( - df: pd.DataFrame, - numerical_sentinel_value: int | None = None, -) -> dict[str, domain.AttributeType]: - """Derive the domain from a given dataset.""" - result = {} - for col in df.columns: - logging.info('Deriving domain for column: %s', col) - match df[col].dtype: - case 'object': - result[col] = domain.CategoricalAttribute( - possible_values=sorted( - df[col].unique(), - key=lambda x: (isinstance(x, str), x), # sort ints before strs. - ) - ) - case 'int': - result[col] = _create_numerical_attribute( - df[col], 'int', numerical_sentinel_value - ) - case 'float': - result[col] = _create_numerical_attribute( - df[col], 'float', numerical_sentinel_value - ) - case _: - raise ValueError(f'Unsupported dtype: {df[col].dtype}') - return result - - -def _get_yaml_filename(dataset_path: PathType) -> str: - return os.path.basename(dataset_path) + '_domain.yaml' - - -def main(_) -> None: - read_csv_kwargs = _CSV_READ_ARGS.value().to_read_csv_kwargs() - - # If output_dir is not set, use the parent directory of the dataset path. - output_dir = _OUTPUT_DIR.value - if not output_dir: - output_dir = _DATASET_PATH.value.parent - - dataset_path = pathlib.Path(_DATASET_PATH.value) - if output_dir: - output_dir_path = pathlib.Path(output_dir) - else: - output_dir_path = dataset_path.parent - yaml_path = output_dir_path / _get_yaml_filename(dataset_path) - - df = pd.read_csv( - filepath_or_buffer=str(_DATASET_PATH.value), **read_csv_kwargs - ) - dom = derive_domain_from_data(df, _NUMERICAL_SENTINEL_VALUE.value) - logging.info('Writing domain to %s', yaml_path) - domain.to_yaml_file(dom, yaml_path) - - -if __name__ == '__main__': - app.run(main) diff --git a/dpsynth/eval/correlation_computation.py b/dpsynth/eval/correlation_computation.py index 60a9968..c181166 100644 --- a/dpsynth/eval/correlation_computation.py +++ b/dpsynth/eval/correlation_computation.py @@ -15,6 +15,7 @@ """Computes two-way attribute correlations.""" from collections.abc import Iterable +import dataclasses import itertools from typing import Any from dpsynth.eval import types diff --git a/dpsynth/eval/one_way_distribution_computation.py b/dpsynth/eval/one_way_distribution_computation.py index ca947be..9390019 100644 --- a/dpsynth/eval/one_way_distribution_computation.py +++ b/dpsynth/eval/one_way_distribution_computation.py @@ -15,6 +15,7 @@ """Computes one-way marginal distances.""" from collections.abc import Iterable +import dataclasses from typing import Any from dpsynth.eval import types from dpsynth.pipeline_transformations import diagnostic_info diff --git a/dpsynth/pipeline_transformations/diagnostic_info.py b/dpsynth/pipeline_transformations/diagnostic_info.py index 8d26c04..baf5b5c 100644 --- a/dpsynth/pipeline_transformations/diagnostic_info.py +++ b/dpsynth/pipeline_transformations/diagnostic_info.py @@ -15,6 +15,10 @@ """Module for updating diagnostic information.""" import copy + +from dpsynth.pipeline_transformations import types # pylint: disable=g-bad-import-order +import pipeline_dp # pylint: disable=g-bad-import-order + from dataclasses import dataclass, field from typing import Optional diff --git a/pyproject.toml b/pyproject.toml index 8fa36bc..7b134a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,3 +51,6 @@ version = {attr = "dpsynth.__version__"} [tool.setuptools.packages.find] namespaces = false + +[tool.pytest.ini_options] +addopts = "--import-mode=importlib" diff --git a/tests/bin/derive_domain_test.py b/tests/bin/derive_domain_test.py deleted file mode 100644 index 021e330..0000000 --- a/tests/bin/derive_domain_test.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from absl.testing import absltest -from dpsynth import domain -from dpsynth.bin import derive_domain -import numpy as np -import pandas as pd - - -class DeriveDomainTest(absltest.TestCase): - - def test_derive_domain_from_data(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'float': [3.14, 2.72, 1.61, 1.41, 1.23, 1.05], - }) - expected_domain = { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.NumericalAttribute( - min_value=0, max_value=5, clip_to_range=True, dtype='int' - ), - 'float': domain.NumericalAttribute( - min_value=1.05, max_value=3.14, dtype='float' - ), - } - self.assertEqual(derive_domain.derive_domain_from_data(df), expected_domain) - - def test_derive_domain_from_data_single_value(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'int_single_value': [5, 5, 5, 5, 5, 5], - 'float': [3.14, 2.72, 1.61, 1.41, 1.23, 1.05], - 'float_single_value': [3.14, 3.14, 3.14, 3.14, 3.14, 3.14], - }) - expected_domain = { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.NumericalAttribute( - min_value=0, max_value=5, clip_to_range=True, dtype='int' - ), - 'int_single_value': domain.CategoricalAttribute( - possible_values=[5], - ), - 'float': domain.NumericalAttribute( - min_value=1.05, max_value=3.14, dtype='float' - ), - 'float_single_value': domain.CategoricalAttribute( - possible_values=[3.14], - ), - } - self.assertEqual(derive_domain.derive_domain_from_data(df), expected_domain) - - def test_derive_domain_from_data_nan(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'float': [np.nan, 2.72, 1.61, 1.41, 1.23, 1.05], - }) - expected_domain = { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.NumericalAttribute( - min_value=0, max_value=5, clip_to_range=True, dtype='int' - ), - 'float': domain.NumericalAttribute( - min_value=1.05, max_value=2.72, dtype='float' - ), - } - self.assertEqual(derive_domain.derive_domain_from_data(df), expected_domain) - - def test_derive_domain_from_data_only_nan(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - 'float': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - }) - - # This is a hack to get around the fact that we can't compare - # np.nan for equality. np.nan == np.nan is always False. - - self.assertEqual( - derive_domain.derive_domain_from_data(df), - { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.CategoricalAttribute( - possible_values=[None], - ), - 'float': domain.CategoricalAttribute( - possible_values=[None], - ), - }, - ) - - def test_derive_domain_from_single_value_and_nan(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'int_single_value': [5, 5, 5, np.nan, np.nan, np.nan], - 'float': [1.0, 2.72, 1.61, 1.41, 1.23, 1.05], - 'float_single_value': [np.nan, 3.14, np.nan, 3.14, np.nan, 3.14], - }) - expected_domain = { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.NumericalAttribute( - min_value=0, max_value=5, clip_to_range=True, dtype='int' - ), - 'int_single_value': domain.CategoricalAttribute( - possible_values=[5], - ), - 'float': domain.NumericalAttribute( - min_value=1.0, max_value=2.72, dtype='float' - ), - 'float_single_value': domain.CategoricalAttribute( - possible_values=[3.14], - ), - } - self.assertEqual(derive_domain.derive_domain_from_data(df), expected_domain) - - def test_derive_domain_from_data_sentinel_value(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'int_sentinel_value': [-1, 1, 2, 3, 4, -1], - 'float': [3.14, 2.72, 1.61, 1.41, 1.23, 1.05], - 'float_sentinel_value': [-1, 2.72, 1.61, 1.41, 1.23, -1], - }) - expected_domain = { - 'cat': domain.CategoricalAttribute(possible_values=['A', 'B', 'C']), - 'int': domain.NumericalAttribute( - min_value=0, max_value=5, clip_to_range=True, dtype='int' - ), - 'int_sentinel_value': domain.NumericalAttribute( - min_value=1, max_value=4, clip_to_range=False, dtype='int' - ), - 'float': domain.NumericalAttribute( - min_value=1.05, max_value=3.14, dtype='float' - ), - 'float_sentinel_value': domain.NumericalAttribute( - min_value=1.23, max_value=2.72, clip_to_range=False, dtype='float' - ), - } - self.assertEqual( - derive_domain.derive_domain_from_data(df, -1), expected_domain - ) - - def test_derive_domain_from_data_sentinel_value_invalid(self): - df = pd.DataFrame({ - 'cat': ['A', 'B', 'C', 'A', 'B', 'C'], - 'int': [0, 1, 2, 3, 4, 5], - 'int_sentinel_value': [-1, 1, 2, 3, 4, -1], - 'float': [3.14, 2.72, 1.61, 1.41, 1.23, 1.05], - 'float_sentinel_value': [-1, 2.72, 1.61, 1.41, 1.23, -1], - }) - with self.assertRaises(ValueError): - derive_domain.derive_domain_from_data(df, 0) - - def test_derive_domain_categorical_attribute_with_ints(self): - df = pd.DataFrame({'cat': ['A', 'B', 'C', -1, 'D', 1]}) - expected_domain = { - 'cat': domain.CategoricalAttribute( - possible_values=[-1, 1, 'A', 'B', 'C', 'D'] - ), - } - self.assertEqual(derive_domain.derive_domain_from_data(df), expected_domain) - - -if __name__ == '__main__': - absltest.main()