From c27a2ac9c630e62796e5aa4d57e79678a3165775 Mon Sep 17 00:00:00 2001
From: malcoln-dandaro_data
Date: Tue, 10 Mar 2026 13:58:01 -0300
Subject: [PATCH 1/2] feat: add table/column comments guidance to synthetic data gen skill

Adds documentation for DDL-first and post-write approaches to set table and column comments when writing Delta tables to Unity Catalog.

Co-Authored-By: Claude Opus 4.6
---
 .../databricks-synthetic-data-gen/SKILL.md    | 18 ++++++--
 .../references/5-output-formats.md            | 42 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
index 5bd95e58..8e1d98c1 100644
--- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md
+++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
@@ -83,10 +83,19 @@ Show a clear specification with **YOUR ASSUMPTIONS surfaced**. Always start with
 Volume: /Volumes/{user_catalog}/ecommerce_demo/raw_data/
 ```
 
-| Table | Columns | Rows | Key Assumptions |
-|-------|---------|------|-----------------|
-| customers | customer_id, name, email, tier, region | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% |
-| orders | order_id, customer_id (FK), amount, status | 15,000 | Enterprise customers generate 5x more orders |
+| Table | Columns | Description | Rows | Key Assumptions |
+|-------|---------|-------------|------|-----------------|
+| customers | customer_id, name, email, tier, region | Synthetic customer profiles | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% |
+| orders | order_id, customer_id (FK), amount, status | Customer purchase transactions | 15,000 | Enterprise customers generate 5x more orders |
+
+Include column-level descriptions in the plan (these become column comments in Unity Catalog):
+
+| Table | Column | Comment |
+|-------|--------|---------|
+| customers | customer_id | Unique customer identifier (CUST-XXXXX) |
+| customers | tier | Customer tier: Free, Pro, Enterprise |
+| orders | customer_id | FK to customers.customer_id |
+| orders | amount | Order total in USD |
 
 **Assumptions I'm making:**
 - Amount distribution: log-normal by tier (Enterprise ~$1800, Pro ~$245, Free ~$55)
@@ -238,6 +247,7 @@ See [references/5-output-formats.md](references/5-output-formats.md) for detaile
 - Create infrastructure in script (`CREATE SCHEMA/VOLUME IF NOT EXISTS`)
 - Do NOT create catalogs - assume they exist
 - Delta tables as default
+- Add table and column comments for discoverability in Unity Catalog (see [references/5-output-formats.md](references/5-output-formats.md))
 
 ## Related Skills
 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
index c283a82c..21c347b2 100644
--- a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
+++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
@@ -12,11 +12,11 @@ SCHEMA = ""
 VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
 
 # Note: Assume catalog exists - do NOT create it
-spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA} COMMENT 'Synthetic data for demo scenario'")
 spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
 ```
 
-**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume.
+**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. Always add a `COMMENT` to schemas describing the dataset purpose.
 
 ---
 
@@ -126,6 +126,44 @@ customers_df.write \
 - Skip the SDP bronze/silver/gold pipeline
 - Direct SQL analytics
 
+### Adding Table and Column Comments
+
+Always add comments to Delta tables for discoverability in Unity Catalog. Prefer DDL-first approach — define the table with comments, then insert data.
+
+**DDL-first (preferred):**
+```python
+# Create table with inline column comments and table comment
+spark.sql(f"""
+    CREATE TABLE IF NOT EXISTS {CATALOG}.{SCHEMA}.customers (
+        customer_id STRING COMMENT 'Unique customer identifier (CUST-XXXXX)',
+        name STRING COMMENT 'Full customer name',
+        email STRING COMMENT 'Customer email address',
+        tier STRING COMMENT 'Customer tier: Free, Pro, Enterprise',
+        region STRING COMMENT 'Geographic region',
+        arr DOUBLE COMMENT 'Annual recurring revenue in USD'
+    )
+    COMMENT 'Synthetic customer data for e-commerce demo'
+""")
+
+# Then write data into the pre-defined table
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+```
+
+**Post-write (alternative):**
+```python
+# Write first, then add comments
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# Add table comment
+spark.sql(f"COMMENT ON TABLE {CATALOG}.{SCHEMA}.customers IS 'Synthetic customer data for e-commerce demo'")
+
+# Add column comments
+spark.sql(f"ALTER TABLE {CATALOG}.{SCHEMA}.customers ALTER COLUMN customer_id COMMENT 'Unique customer identifier (CUST-XXXXX)'")
+spark.sql(f"ALTER TABLE {CATALOG}.{SCHEMA}.customers ALTER COLUMN tier COMMENT 'Customer tier: Free, Pro, Enterprise'")
+```
+
+**Note:** Column/table comments only apply to Delta tables in Unity Catalog. Parquet/JSON/CSV files written to volumes do not support metadata comments.
+
 ---
 
 ## Write Modes

From 873f8b4ed4bc714536b471d3ff6ec415250de5f2 Mon Sep 17 00:00:00 2001
From: malcoln-dandaro_data
Date: Tue, 10 Mar 2026 14:03:38 -0300
Subject: [PATCH 2/2] feat: add PySpark StructField metadata approach for column comments

Co-Authored-By: Claude Opus 4.6
---
 .../references/5-output-formats.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
index 21c347b2..5443ebd1 100644
--- a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
+++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
@@ -149,6 +149,24 @@ spark.sql(f"""
 customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
 ```
 
+**PySpark schema with comments:**
+```python
+from pyspark.sql.types import StructType, StructField, StringType, DoubleType
+
+schema = StructType([
+    StructField("customer_id", StringType(), True, metadata={"comment": "Unique customer identifier (CUST-XXXXX)"}),
+    StructField("name", StringType(), True, metadata={"comment": "Full customer name"}),
+    StructField("email", StringType(), True, metadata={"comment": "Customer email address"}),
+    StructField("tier", StringType(), True, metadata={"comment": "Customer tier: Free, Pro, Enterprise"}),
+    StructField("region", StringType(), True, metadata={"comment": "Geographic region"}),
+    StructField("arr", DoubleType(), True, metadata={"comment": "Annual recurring revenue in USD"}),
+])
+
+# Apply schema when creating the DataFrame, comments persist when saved as Delta
+customers_df = spark.createDataFrame(data, schema)
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+```
+
 **Post-write (alternative):**
 ```python
 # Write first, then add comments