From c27a2ac9c630e62796e5aa4d57e79678a3165775 Mon Sep 17 00:00:00 2001
From: malcoln-dandaro_data
Date: Tue, 10 Mar 2026 13:58:01 -0300
Subject: [PATCH 1/2] feat: add table/column comments guidance to synthetic data gen skill

Adds documentation for DDL-first and post-write approaches to set table and column comments when writing Delta tables to Unity Catalog.

Co-Authored-By: Claude Opus 4.6
---
 .../databricks-synthetic-data-gen/SKILL.md    | 18 ++++++--
 .../references/5-output-formats.md            | 42 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
index 5bd95e58..8e1d98c1 100644
--- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md
+++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
@@ -83,10 +83,19 @@ Show a clear specification with **YOUR ASSUMPTIONS surfaced**. Always start with
 Volume: /Volumes/{user_catalog}/ecommerce_demo/raw_data/
 ```
 
-| Table | Columns | Rows | Key Assumptions |
-|-------|---------|------|-----------------|
-| customers | customer_id, name, email, tier, region | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% |
-| orders | order_id, customer_id (FK), amount, status | 15,000 | Enterprise customers generate 5x more orders |
+| Table | Columns | Description | Rows | Key Assumptions |
+|-------|---------|-------------|------|-----------------|
+| customers | customer_id, name, email, tier, region | Synthetic customer profiles | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% |
+| orders | order_id, customer_id (FK), amount, status | Customer purchase transactions | 15,000 | Enterprise customers generate 5x more orders |
+
+Include column-level descriptions in the plan (these become column comments in Unity Catalog):
+
+| Table | Column | Comment |
+|-------|--------|---------|
+| customers | customer_id | Unique customer identifier (CUST-XXXXX) |
+| customers | tier | Customer tier: Free, Pro, Enterprise |
+| orders | customer_id | FK to customers.customer_id |
+| orders | amount | Order total in USD |
 
 **Assumptions I'm making:**
 - Amount distribution: log-normal by tier (Enterprise ~$1800, Pro ~$245, Free ~$55)
@@ -238,6 +247,7 @@ See [references/5-output-formats.md](references/5-output-formats.md) for detaile
 - Create infrastructure in script (`CREATE SCHEMA/VOLUME IF NOT EXISTS`)
 - Do NOT create catalogs - assume they exist
 - Delta tables as default
+- Add table and column comments for discoverability in Unity Catalog (see [references/5-output-formats.md](references/5-output-formats.md))
 
 ## Related Skills
 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
index c283a82c..21c347b2 100644
--- a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
+++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
@@ -12,11 +12,11 @@ SCHEMA = ""
 VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
 
 # Note: Assume catalog exists - do NOT create it
-spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA} COMMENT 'Synthetic data for demo scenario'")
 spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
 ```
 
-**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume.
+**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. Always add a `COMMENT` to schemas describing the dataset purpose.
 
 ---
 
@@ -126,6 +126,44 @@ customers_df.write \
 - Skip the SDP bronze/silver/gold pipeline
 - Direct SQL analytics
 
+### Adding Table and Column Comments
+
+Always add comments to Delta tables for discoverability in Unity Catalog. Prefer DDL-first approach — define the table with comments, then insert data.
+
+**DDL-first (preferred):**
+```python
+# Create table with inline column comments and table comment
+spark.sql(f"""
+    CREATE TABLE IF NOT EXISTS {CATALOG}.{SCHEMA}.customers (
+        customer_id STRING COMMENT 'Unique customer identifier (CUST-XXXXX)',
+        name STRING COMMENT 'Full customer name',
+        email STRING COMMENT 'Customer email address',
+        tier STRING COMMENT 'Customer tier: Free, Pro, Enterprise',
+        region STRING COMMENT 'Geographic region',
+        arr DOUBLE COMMENT 'Annual recurring revenue in USD'
+    )
+    COMMENT 'Synthetic customer data for e-commerce demo'
+""")
+
+# Then write data into the pre-defined table
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+```
+
+**Post-write (alternative):**
+```python
+# Write first, then add comments
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# Add table comment
+spark.sql(f"COMMENT ON TABLE {CATALOG}.{SCHEMA}.customers IS 'Synthetic customer data for e-commerce demo'")
+
+# Add column comments
+spark.sql(f"ALTER TABLE {CATALOG}.{SCHEMA}.customers ALTER COLUMN customer_id COMMENT 'Unique customer identifier (CUST-XXXXX)'")
+spark.sql(f"ALTER TABLE {CATALOG}.{SCHEMA}.customers ALTER COLUMN tier COMMENT 'Customer tier: Free, Pro, Enterprise'")
+```
+
+**Note:** Column/table comments only apply to Delta tables in Unity Catalog. Parquet/JSON/CSV files written to volumes do not support metadata comments.
+
 ---
 
 ## Write Modes

From 873f8b4ed4bc714536b471d3ff6ec415250de5f2 Mon Sep 17 00:00:00 2001
From: malcoln-dandaro_data
Date: Tue, 10 Mar 2026 14:03:38 -0300
Subject: [PATCH 2/2] feat: add PySpark StructField metadata approach for column comments

Co-Authored-By: Claude Opus 4.6
---
 .../references/5-output-formats.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
index 21c347b2..5443ebd1 100644
--- a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
+++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
@@ -149,6 +149,24 @@ spark.sql(f"""
 customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
 ```
 
+**PySpark schema with comments:**
+```python
+from pyspark.sql.types import StructType, StructField, StringType, DoubleType
+
+schema = StructType([
+    StructField("customer_id", StringType(), True, metadata={"comment": "Unique customer identifier (CUST-XXXXX)"}),
+    StructField("name", StringType(), True, metadata={"comment": "Full customer name"}),
+    StructField("email", StringType(), True, metadata={"comment": "Customer email address"}),
+    StructField("tier", StringType(), True, metadata={"comment": "Customer tier: Free, Pro, Enterprise"}),
+    StructField("region", StringType(), True, metadata={"comment": "Geographic region"}),
+    StructField("arr", DoubleType(), True, metadata={"comment": "Annual recurring revenue in USD"}),
+])
+
+# Apply schema when creating the DataFrame, comments persist when saved as Delta
+customers_df = spark.createDataFrame(data, schema)
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+```
+
 **Post-write (alternative):**
 ```python
 # Write first, then add comments