From b9643ac9c251f7a4b4dc05d496393dce63c26d4c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:29:51 +0000 Subject: [PATCH 1/3] Initial plan From 63fbf3a4eb6790ea17981aba49f65d6fe4f8e205 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:37:42 +0000 Subject: [PATCH 2/3] Fix List sampling when min_length > 32 Co-authored-by: jjurm <6285777+jjurm@users.noreply.github.com> --- dataframely/columns/list.py | 7 +++++-- tests/column_types/test_list.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py index 5c22d24..6c1c66a 100644 --- a/dataframely/columns/list.py +++ b/dataframely/columns/list.py @@ -131,9 +131,12 @@ def pyarrow_dtype(self) -> pa.DataType: def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series: # First, sample the number of items per list element # NOTE: We default to 32 for the upper bound as we need some kind of reasonable - # upper bound if none is set. + # upper bound if none is set. If min_length is greater than 32, we use + # min_length as the default upper bound instead. + min_len = self.min_length or 0 + default_max = max(32, min_len) element_lengths = generator.sample_int( - n, min=self.min_length or 0, max=(self.max_length or 32) + 1 + n, min=min_len, max=(self.max_length or default_max) + 1 ) # Then, we can sample the inner elements in a flat series diff --git a/tests/column_types/test_list.py b/tests/column_types/test_list.py index e860b85..44e17a0 100644 --- a/tests/column_types/test_list.py +++ b/tests/column_types/test_list.py @@ -179,3 +179,13 @@ def test_inner_primary_key_struct( _, failure = schema.filter(df) assert failure.counts() == {"a|primary_key": failure_count} assert validation_mask(df, failure).to_list() == mask + + +@pytest.mark.parametrize("min_length", [0, 10, 33, 100]) +def test_list_sampling_with_min_length(min_length: int) -> None: + """Test that sampling works correctly when min_length > 32.""" + schema = create_schema("test", {"a": dy.List(dy.Int64(), min_length=min_length)}) + df = schema.sample(num_rows=10) + assert len(df) == 10 + # Verify all lists have at least min_length elements + assert df["a"].list.len().min() >= min_length From e1116b3246b5e3e8ee2402d08395049994d7afe2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 11:02:03 +0000 Subject: [PATCH 3/3] Fix mypy errors in test_list.py Co-authored-by: jjurm <6285777+jjurm@users.noreply.github.com> --- tests/column_types/test_list.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/column_types/test_list.py b/tests/column_types/test_list.py index 44e17a0..44bb858 100644 --- a/tests/column_types/test_list.py +++ b/tests/column_types/test_list.py @@ -1,6 +1,8 @@ # Copyright (c) QuantCo 2025-2025 # SPDX-License-Identifier: BSD-3-Clause +from typing import cast + import polars as pl import pytest @@ -188,4 +190,5 @@ def test_list_sampling_with_min_length(min_length: int) -> None: df = schema.sample(num_rows=10) assert len(df) == 10 # Verify all lists have at least min_length elements - assert df["a"].list.len().min() >= min_length + min_list_len = cast(int, df["a"].list.len().min()) + assert min_list_len >= min_length