Pass csv parameters during duckdb connection

Halpph · Halpph · commit e45de70c8c71 · 2025-02-20T12:28:21.000+01:00
diff --git a/datacontract/engines/soda/connections/duckdb.py b/datacontract/engines/soda/connections/duckdb.py
@@ -42,14 +42,45 @@ def get_duckdb_connection(data_contract, server, run: Run):
         elif server.format == "csv":
             columns = to_csv_types(model)
             run.log_info("Using columns: " + str(columns))
-            if columns is None:
-                con.sql(
-                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
-                )
-            else:
-                con.sql(
-                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
-                )
+
+            # Start with the required parameter.
+            params = ["hive_partitioning=1"]
+
+            # Define a mapping for CSV parameters: server attribute -> read_csv parameter name.
+            param_mapping = {
+                "delimiter": "delim",  # Map server.delimiter to 'delim'
+                "header": "header",
+                "escape": "escape",
+                "all_varchar": "all_varchar",
+                "allow_quoted_nulls": "allow_quoted_nulls",
+                "dateformat": "dateformat",
+                "decimal_separator": "decimal_separator",
+                "new_line": "new_line",
+                "timestampformat": "timestampformat",
+                "quote": "quote",
+            }
+            for server_attr, read_csv_param in param_mapping.items():
+                value = getattr(server, server_attr, None)
+                if value is not None:
+                    # Wrap string values in single quotes (inserted verbatim: an embedded ' is not escaped).
+                    if isinstance(value, str):
+                        params.append(f"{read_csv_param}='{value}'")
+                    else:
+                        params.append(f"{read_csv_param}={value}")
+
+            # Add columns if they exist.
+            if columns is not None:
+                params.append(f"columns={columns}")
+
+            # Build the parameter string.
+            params_str = ", ".join(params)
+
+            # Create the view with the assembled parameters.
+            con.sql(f"""
+                CREATE VIEW "{model_name}" AS
+                SELECT * FROM read_csv('{model_path}', {params_str});
+            """)
+
         elif server.format == "delta":
             con.sql("update extensions;")  # Make sure we have the latest delta extension
             con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
diff --git a/datacontract/model/data_contract_specification.py b/datacontract/model/data_contract_specification.py
@@ -58,6 +58,15 @@ class Server(pyd.BaseModel):
     dataset: str | None = None
     path: str | None = None
     delimiter: str | None = None
+    header: bool | None = None
+    escape: str | None = None
+    all_varchar: bool | None = None
+    allow_quoted_nulls: bool | None = None
+    dateformat: str | None = None
+    decimal_separator: str | None = None
+    new_line: str | None = None
+    timestampformat: str | None = None
+    quote: str | None = None
     endpointUrl: str | None = None
     location: str | None = None
     account: str | None = None
diff --git a/datacontract/schemas/datacontract-1.1.0.schema.json b/datacontract/schemas/datacontract-1.1.0.schema.json
@@ -1212,11 +1212,47 @@
         },
         "delimiter": {
           "type": "string",
-          "enum": [
-            "new_line",
-            "array"
+          "anyOf": [
+            { "enum": ["new_line", "array"] },
+            { "pattern": "^.$" }
           ],
-          "description": "Only for format = json. How multiple json documents are delimited within one file"
+          "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+        },
+        "header": {
+          "type": "boolean",
+          "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+        },
+        "escape": {
+          "type": "string",
+          "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+        },
+        "all_varchar": {
+          "type": "boolean",
+          "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+        },
+        "allow_quoted_nulls": {
+          "type": "boolean",
+          "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+        },
+        "dateformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+        },
+        "decimal_separator": {
+          "type": "string",
+          "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+        },
+        "new_line": {
+          "type": "string",
+          "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+        },
+        "timestampformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+        },
+        "quote": {
+          "type": "string",
+          "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
         }
       },
       "required": [
@@ -1248,11 +1285,47 @@
         },
         "delimiter": {
           "type": "string",
-          "enum": [
-            "new_line",
-            "array"
+          "anyOf": [
+            { "enum": ["new_line", "array"] },
+            { "pattern": "^.$" }
           ],
-          "description": "Only for format = json. How multiple json documents are delimited within one file"
+          "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+        },
+        "header": {
+          "type": "boolean",
+          "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+        },
+        "escape": {
+          "type": "string",
+          "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+        },
+        "all_varchar": {
+          "type": "boolean",
+          "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+        },
+        "allow_quoted_nulls": {
+          "type": "boolean",
+          "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+        },
+        "dateformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+        },
+        "decimal_separator": {
+          "type": "string",
+          "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+        },
+        "new_line": {
+          "type": "string",
+          "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+        },
+        "timestampformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+        },
+        "quote": {
+          "type": "string",
+          "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
         }
       },
       "required": [
@@ -1336,11 +1409,47 @@
         },
         "delimiter": {
           "type": "string",
-          "enum": [
-            "new_line",
-            "array"
+          "anyOf": [
+            { "enum": ["new_line", "array"] },
+            { "pattern": "^.$" }
           ],
-          "description": "Only for format = json. How multiple json documents are delimited within one file"
+          "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+        },
+        "header": {
+          "type": "boolean",
+          "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+        },
+        "escape": {
+          "type": "string",
+          "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+        },
+        "all_varchar": {
+          "type": "boolean",
+          "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+        },
+        "allow_quoted_nulls": {
+          "type": "boolean",
+          "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+        },
+        "dateformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+        },
+        "decimal_separator": {
+          "type": "string",
+          "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+        },
+        "new_line": {
+          "type": "string",
+          "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+        },
+        "timestampformat": {
+          "type": "string",
+          "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+        },
+        "quote": {
+          "type": "string",
+          "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
         }
       },
       "required": [
diff --git a/datacontract/schemas/odcs-3.0.1.schema.json b/datacontract/schemas/odcs-3.0.1.schema.json
@@ -776,11 +776,47 @@
           },
           "delimiter": {
             "type": "string",
-            "enum": [
-              "new_line",
-              "array"
+            "anyOf": [
+              { "enum": ["new_line", "array"] },
+              { "pattern": "^.$" }
             ],
-            "description": "Only for format = json. How multiple json documents are delimited within one file"
+            "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+          },
+          "header": {
+            "type": "boolean",
+            "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+          },
+          "escape": {
+            "type": "string",
+            "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+          },
+          "all_varchar": {
+            "type": "boolean",
+            "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+          },
+          "allow_quoted_nulls": {
+            "type": "boolean",
+            "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+          },
+          "dateformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+          },
+          "decimal_separator": {
+            "type": "string",
+            "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+          },
+          "new_line": {
+            "type": "string",
+            "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+          },
+          "timestampformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+          },
+          "quote": {
+            "type": "string",
+            "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
           }
         },
         "required": [
@@ -1381,11 +1417,47 @@
           },
           "delimiter": {
             "type": "string",
-            "enum": [
-              "new_line",
-              "array"
+            "anyOf": [
+              { "enum": ["new_line", "array"] },
+              { "pattern": "^.$" }
             ],
-            "description": "Only for format = json. How multiple json documents are delimited within one file"
+            "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+          },
+          "header": {
+            "type": "boolean",
+            "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+          },
+          "escape": {
+            "type": "string",
+            "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+          },
+          "all_varchar": {
+            "type": "boolean",
+            "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+          },
+          "allow_quoted_nulls": {
+            "type": "boolean",
+            "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+          },
+          "dateformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+          },
+          "decimal_separator": {
+            "type": "string",
+            "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+          },
+          "new_line": {
+            "type": "string",
+            "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+          },
+          "timestampformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+          },
+          "quote": {
+            "type": "string",
+            "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
           }
         },
         "required": [
@@ -1417,11 +1489,47 @@
           },
           "delimiter": {
             "type": "string",
-            "enum": [
-              "new_line",
-              "array"
+            "anyOf": [
+              { "enum": ["new_line", "array"] },
+              { "pattern": "^.$" }
             ],
-            "description": "Only for format = json. How multiple json documents are delimited within one file"
+            "description": "For JSON format, only 'new_line' or 'array' is allowed to indicate how multiple JSON documents are delimited. For CSV format, any single character can be used as the delimiter between columns. Only valid for JSON and CSV."
+          },
+          "header": {
+            "type": "boolean",
+            "description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
+          },
+          "escape": {
+            "type": "string",
+            "description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
+          },
+          "all_varchar": {
+            "type": "boolean",
+            "description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
+          },
+          "allow_quoted_nulls": {
+            "type": "boolean",
+            "description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
+          },
+          "dateformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
+          },
+          "decimal_separator": {
+            "type": "string",
+            "description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
+          },
+          "new_line": {
+            "type": "string",
+            "description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
+          },
+          "timestampformat": {
+            "type": "string",
+            "description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
+          },
+          "quote": {
+            "type": "string",
+            "description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
           }
         },
         "required": [