From 45f6fcefffbe1c8f2564cb5d96ab4e2b59b62894 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:37:43 +0000 Subject: [PATCH 1/3] Initial plan From 9435e48d0b34e4a430a2ac8138fd2643389ed5d0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:49:04 +0000 Subject: [PATCH 2/3] Fix sampling for Array columns with nested inner types (List, Array, Struct) Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 42 +++++++++++++++++++++++++++++++++++- tests/columns/test_sample.py | 30 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index ddd9e9d..ceda12a 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -117,9 +117,49 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: n_elements = n * math.prod(self.shape) all_elements = self.inner.sample(generator, n_elements) + # For nested types (List, Array, Struct), we can't use reshape() directly + # because the inner type is not a scalar. Instead, we need to construct + # the nested structure manually. + from .list import List + from .struct import Struct + + if isinstance(self.inner, (List, Array, Struct)): + # Convert to a list and then group into arrays of the specified shape + all_elements_list = all_elements.to_list() + + def build_nested_structure(elements: list, shape: tuple[int, ...]) -> list: + """Build nested structure for a single array.""" + if len(shape) == 1: + # Base case: this is a 1D array + return elements + else: + # Recursive case: split into rows + row_size = math.prod(shape[1:]) + rows = [] + for i in range(shape[0]): + start = i * row_size + end = start + row_size + row_elements = elements[start:end] + rows.append(build_nested_structure(row_elements, shape[1:])) + return rows + + # Build n arrays, each with the specified shape + elements_per_array = math.prod(self.shape) + nested_arrays = [] + for i in range(n): + start = i * elements_per_array + end = start + elements_per_array + array_elements = all_elements_list[start:end] + nested_arrays.append(build_nested_structure(array_elements, self.shape)) + + result = pl.Series(nested_arrays, dtype=self.dtype) + else: + # For scalar types, use the original reshape approach + result = all_elements.reshape((n, *self.shape)) + # Finally, apply a null mask return generator._apply_null_mask( - all_elements.reshape((n, *self.shape)), + result, null_probability=self._null_probability, ) diff --git a/tests/columns/test_sample.py b/tests/columns/test_sample.py index d088847..254ffbe 100644 --- a/tests/columns/test_sample.py +++ b/tests/columns/test_sample.py @@ -190,6 +190,36 @@ def test_sample_array(generator: Generator) -> None: assert set(samples.arr.len()) == {2, None} +@pytest.mark.parametrize( + "arr_size,n_samples", + [ + (1, 1), + (2, 1), + (3, 5), + (1, 10), + ], +) +def test_sample_array_list(arr_size: int, n_samples: int, generator: Generator) -> None: + """Test sampling for Array(List(...), ...) which previously failed.""" + column = dy.Array(dy.List(dy.Bool()), arr_size) + samples = sample_and_validate(column, generator, n=n_samples) + assert len(samples) == n_samples + + +def test_sample_array_of_array(generator: Generator) -> None: + """Test sampling for Array(Array(...), ...).""" + column = dy.Array(dy.Array(dy.Bool(), 2), 3) + samples = sample_and_validate(column, generator, n=10) + assert len(samples) == 10 + + +def test_sample_array_of_struct(generator: Generator) -> None: + """Test sampling for Array(Struct(...), ...).""" + column = dy.Array(dy.Struct({"x": dy.Bool(), "y": dy.Integer()}), 2) + samples = sample_and_validate(column, generator, n=10) + assert len(samples) == 10 + + def test_sample_struct(generator: Generator) -> None: column = dy.Struct( {"a": dy.String(regex="[abc]"), "b": dy.String(regex="[a-z]xx")}, nullable=True From c3007ee0061593c909aa9138517f3d8ce7a0bb43 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:52:29 +0000 Subject: [PATCH 3/3] Move imports to module level for better performance Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index ceda12a..e3ea833 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -15,7 +15,8 @@ from ._base import Check, Column from ._registry import column_from_dict, register -from .list import _list_primary_key_check +from .list import List, _list_primary_key_check +from .struct import Struct if sys.version_info >= (3, 11): from typing import Self @@ -120,9 +121,6 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: # For nested types (List, Array, Struct), we can't use reshape() directly # because the inner type is not a scalar. Instead, we need to construct # the nested structure manually. - from .list import List - from .struct import Struct - if isinstance(self.inner, (List, Array, Struct)): # Convert to a list and then group into arrays of the specified shape all_elements_list = all_elements.to_list()