[FEATURE] RDS-1528: Add optional drop argument to add_column

lipikaramaswamy · lipikaramaswamy · commit d8800d3f9ed0 · 2025-05-20T18:21:12.000Z
GitOrigin-RevId: 23262f480128d45d63252c38b52714029be693b9
diff --git a/src/gretel_client/data_designer/data_designer.py b/src/gretel_client/data_designer/data_designer.py
@@ -408,9 +408,15 @@ def with_evaluation_report(
         self._evaluation_report = GeneralDatasetEvaluation(
             settings=settings
             or EvaluateDataDesignerDatasetSettings(
-                llm_judge_columns=[c.name for c in self.llm_judge_columns],
-                validation_columns=[c.name for c in self.code_validation_columns],
-                defined_categorical_columns=[c.name for c in self._categorical_columns],
+                llm_judge_columns=[
+                    c.name for c in self.llm_judge_columns if not c.drop
+                ],
+                validation_columns=[
+                    c.name for c in self.code_validation_columns if not c.drop
+                ],
+                defined_categorical_columns=[
+                    c.name for c in self._categorical_columns if not c.drop
+                ],
             )
         )
         return self
@@ -592,6 +598,21 @@ def validate(self) -> Self:
         self._build_workflow()
         # Run semantic validation on full schema.
         violations = self._run_semantic_validation()
+
+        # Fail if dropping would leave no columns; build the drop set once (O(1) lookups).
+        dropped = set(self._drop_columns)
+        remaining_cols = [
+            name
+            for name in self._columns
+            if name not in self._latent_person_columns and name not in dropped
+        ]
+
+        if not remaining_cols:
+            raise DataDesignerValidationError(
+                "🛑 All generated columns are configured to be dropped. Please mark at "
+                "least one column with `drop=False`."
+            )
+
         if len(violations) == 0:
             logger.info("Validation passed ✅")
         return self
@@ -665,6 +686,13 @@ def _categorical_columns(self) -> list[SamplerColumn]:
             if (col.type == SamplerType.CATEGORY or col.type == SamplerType.SUBCATEGORY)
         ]
 
+    @property
+    def _drop_columns(self) -> list[str]:
+        """Return names of columns whose `drop` attribute is truthy; rebuilt on each access."""
+        return [
+            col_name for col_name, column in self._columns.items() if getattr(column, "drop", False)
+        ]
+
     @handle_workflow_validation_error
     def _build_workflow(
         self,
@@ -773,7 +801,7 @@ def _build_workflow(
             last_step_added = next_step
 
         ########################################################
-        # Drop all latent columns from the final dataset
+        # Drop intermediate columns (`drop=True`) and latent person columns
         ########################################################
 
         if len(self._latent_person_columns) > 0:
@@ -790,6 +818,18 @@ def _build_workflow(
             )
             last_step_added = drop_latent_columns_step
 
+        if self._drop_columns:
+            drop_cols_step = self._task_registry.DropColumns(columns=self._drop_columns)
+            builder.add_step(
+                step=drop_cols_step,
+                step_inputs=[last_step_added],
+                step_name=(
+                    f"dropping-{len(self._drop_columns)}-intermediate-column"
+                    f"{'s' if len(self._drop_columns) != 1 else ''}"
+                ),
+            )
+            last_step_added = drop_cols_step
+
         ########################################################
         # Run dataset evaluation if requested
         ########################################################
@@ -806,11 +846,15 @@ def _build_workflow(
                 )
             else:
                 general_eval_step = self._task_registry.EvaluateDataDesignerDataset(
-                    llm_judge_columns=[c.name for c in self.llm_judge_columns],
+                    llm_judge_columns=[
+                        c.name for c in self.llm_judge_columns if not c.drop
+                    ],
                     columns_to_ignore=settings.columns_to_ignore,
-                    validation_columns=settings.validation_columns,
+                    validation_columns=[
+                        c.name for c in self.code_validation_columns if not c.drop
+                    ],
                     defined_categorical_columns=[
-                        c.name for c in self._categorical_columns
+                        c.name for c in self._categorical_columns if not c.drop
                     ],
                 )
             builder.add_step(
diff --git a/src/gretel_client/data_designer/types.py b/src/gretel_client/data_designer/types.py
@@ -128,6 +128,17 @@ class SeedDataset(AIDDConfigBase):
 ##########################################################
 
 
+class WithDropColumnMixin(BaseModel):
+    """BaseModel mixin contributing a boolean `drop` field (default False) that marks a
+    column for removal from the final dataset before evaluation."""
+
+    drop: bool = Field(
+        default=False,
+        description="If true, remove this column from the final dataset "
+        "before evaluation.",
+    )
+
+
 class WithDAGColumnMixin:
     @property
     def required_columns(self) -> list[str]:
@@ -138,7 +149,7 @@ def side_effect_columns(self) -> list[str]:
         return []
 
 
-class SamplerColumn(WithPrettyRepr, tasks.ConditionalDataColumn):
+class SamplerColumn(WithDropColumnMixin, WithPrettyRepr, tasks.ConditionalDataColumn):
     """AIDD column that uses a sampler to generate data.
 
     Sampler columns can be conditioned on other sampler columns using the `conditional_params` argument,
@@ -208,7 +219,10 @@ def unpack(cls, column: SerializableConditionalDataColumn | dict) -> Self:
 
 
 class LLMGenColumn(
-    WithPrettyRepr, tasks.GenerateColumnFromTemplateV2, WithDAGColumnMixin
+    WithDropColumnMixin,
+    WithPrettyRepr,
+    tasks.GenerateColumnFromTemplateV2,
+    WithDAGColumnMixin,
 ):
     @model_validator(mode="before")
     @classmethod
@@ -306,7 +320,9 @@ class LLMStructuredColumn(LLMGenColumn):
     output_type: OutputType = Field(default=OutputType.STRUCTURED)
 
 
-class LLMJudgeColumn(WithPrettyRepr, tasks.JudgeWithLlm, WithDAGColumnMixin):
+class LLMJudgeColumn(
+    WithDropColumnMixin, WithPrettyRepr, tasks.JudgeWithLlm, WithDAGColumnMixin
+):
     """AIDD column for llm-as-a-judge with custom rubrics.
 
     Args:
@@ -334,7 +350,9 @@ def step_name(self) -> str:
         return f"using-llm-to-judge-column-{self.name}"
 
 
-class CodeValidationColumn(WithPrettyRepr, AIDDConfigBase, WithDAGColumnMixin):
+class CodeValidationColumn(
+    WithDropColumnMixin, WithPrettyRepr, AIDDConfigBase, WithDAGColumnMixin
+):
     """AIDD column for validating code in another column.
 
     Code validation is currently supported for Python and SQL.
@@ -371,7 +389,10 @@ def step_name(self) -> str:
 
 
 class ExpressionColumn(
-    WithPrettyRepr, tasks.GenerateColumnFromExpression, WithDAGColumnMixin
+    WithDropColumnMixin,
+    WithPrettyRepr,
+    tasks.GenerateColumnFromExpression,
+    WithDAGColumnMixin,
 ):
     """AIDD column for generated data based on jinja2 expressions.
 
diff --git a/src/gretel_client/data_designer/viz_tools.py b/src/gretel_client/data_designer/viz_tools.py
@@ -54,6 +54,7 @@ class AIDDMetadata(BaseModel):
     validation_columns: list[str] = []
     expression_columns: list[str] = []
     evaluation_columns: list[str] = []
+    drop_columns: list[str] = []
     person_samplers: list[str] = []
     code_langs: list[CodeLang | str] = []
     eval_type: LLMJudgePromptTemplateType | None = None
@@ -104,6 +105,7 @@ def from_aidd(cls, aidd: "DataDesigner") -> Self:
             llm_judge_columns=[col.name for col in aidd.llm_judge_columns],
             validation_columns=code_validation_columns,
             expression_columns=[col.name for col in aidd.expression_columns],
+            drop_columns=aidd._drop_columns,
             person_samplers=list(aidd._latent_person_columns.keys()),
             code_langs=[col.output_format for col in aidd.llm_code_columns],
             eval_type=None,
@@ -161,7 +163,7 @@ def display_sample_record(
         table = Table(title="Seed Columns", **table_kws)
         table.add_column("Name")
         table.add_column("Value")
-        for col in aidd_metadata.seed_columns:
+        for col in [c for c in aidd_metadata.seed_columns if c not in aidd_metadata.drop_columns]:
             table.add_row(col, _convert_to_row_element(record[col]))
         render_list.append(_pad_console_element(table))
 
@@ -176,7 +178,7 @@ def display_sample_record(
         table = Table(title="Generated Columns", **table_kws)
         table.add_column("Name")
         table.add_column("Value")
-        for col in [c for c in non_code_columns]:
+        for col in [c for c in non_code_columns if c not in aidd_metadata.drop_columns]:
             table.add_row(col, _convert_to_row_element(record[col]))
         render_list.append(_pad_console_element(table))
 
@@ -207,7 +209,11 @@ def display_sample_record(
     if len(aidd_metadata.validation_columns) > 0:
         table = Table(title="Validation", **table_kws)
         row = []
-        for col in aidd_metadata.validation_columns:
+        for col in [
+            c
+            for c in aidd_metadata.validation_columns
+            if c not in aidd_metadata.drop_columns
+        ]:
             value = record[col]
             if isinstance(value, numbers.Number):
                 table.add_column(col)
@@ -224,7 +230,11 @@ def display_sample_record(
         render_list.append(_pad_console_element(table, (1, 0, 1, 0)))
 
     if len(aidd_metadata.llm_judge_columns) > 0:
-        for col in aidd_metadata.llm_judge_columns:
+        for col in [
+            c
+            for c in aidd_metadata.llm_judge_columns
+            if c not in aidd_metadata.drop_columns
+        ]:
             table = Table(title=f"LLM-as-a-Judge: {col}", **table_kws)
             row = []
             judge = record[col]
diff --git a/tests/gretel_client/data_designer/test_data_designer.py b/tests/gretel_client/data_designer/test_data_designer.py
@@ -521,3 +521,46 @@ def test_get_column_from_kwargs():
     assert person_sampler_column_no_params.params.locale == "en_US"
     assert person_sampler_column_no_params.params.sex is None
     assert person_sampler_column_no_params.params.city is None
+
+
+def _minimal_designer(resource_provider):
+    """Build a designer holding one `uid` uuid sampler column so workflow validation passes."""
+    designer = DataDesigner(gretel_resource_provider=resource_provider)
+    designer.add_column(name="uid", type="uuid", params={})
+    return designer
+
+
+def test_drop_flag_adds_dropcolumns_step(mock_low_level_sdk_resources):
+    """Adding a column with drop=True must register a DropColumns step naming it."""
+    designer = _minimal_designer(mock_low_level_sdk_resources.mock_resource_provider)
+    designer.add_column(
+        name="dude", type="category", params={"values": ["John", "Jane"]}, drop=True
+    )
+    designer.preview()
+
+    recorded = mock_low_level_sdk_resources.mock_workflow_builder.add_step.mock_calls
+    # Index 2 of a mock call is its kwargs dict; `step` is read from there.
+    drop_steps = [c[2]["step"] for c in recorded if isinstance(c[2]["step"], DropColumns)]
+    drop_step = drop_steps[0] if drop_steps else None
+
+    assert drop_step is not None
+    assert drop_step.columns == ["dude"]
+
+
+def test_drop_flag_false_retains_column(mock_low_level_sdk_resources):
+    """With drop=False the column is not flagged and no DropColumns step is added."""
+    designer = _minimal_designer(mock_low_level_sdk_resources.mock_resource_provider)
+    designer.add_column(
+        name="dude",
+        type="category",
+        params={"values": ["John", "Jane"]},
+        drop=False,
+    )
+    designer.preview()
+
+    assert "dude" not in designer._drop_columns
+
+    recorded = mock_low_level_sdk_resources.mock_workflow_builder.add_step.mock_calls
+    # Index 2 of a mock call is its kwargs dict.
+    added_steps = [c[2]["step"] for c in recorded]
+    assert not any(isinstance(step, DropColumns) for step in added_steps)