Skip to content

Commit e45de70

Browse files
committed
Pass csv parameters during duckdb connection
1 parent e636ede commit e45de70

4 files changed

Lines changed: 289 additions & 32 deletions

File tree

datacontract/engines/soda/connections/duckdb.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,45 @@ def get_duckdb_connection(data_contract, server, run: Run):
4242
elif server.format == "csv":
4343
columns = to_csv_types(model)
4444
run.log_info("Using columns: " + str(columns))
45-
if columns is None:
46-
con.sql(
47-
f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
48-
)
49-
else:
50-
con.sql(
51-
f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
52-
)
45+
46+
# Start with the required parameter.
47+
params = ["hive_partitioning=1"]
48+
49+
# Define a mapping for CSV parameters: server attribute -> read_csv parameter name.
50+
param_mapping = {
51+
"delimiter": "delim", # Map server.delimiter to 'delim'
52+
"header": "header",
53+
"escape": "escape",
54+
"all_varchar": "all_varchar",
55+
"allow_quoted_nulls": "allow_quoted_nulls",
56+
"dateformat": "dateformat",
57+
"decimal_separator": "decimal_separator",
58+
"new_line": "new_line",
59+
"timestampformat": "timestampformat",
60+
"quote": "quote",
61+
}
62+
for server_attr, read_csv_param in param_mapping.items():
63+
value = getattr(server, server_attr, None)
64+
if value is not None:
65+
# Wrap string values in quotes.
66+
if isinstance(value, str):
67+
params.append(f"{read_csv_param}='{value}'")
68+
else:
69+
params.append(f"{read_csv_param}={value}")
70+
71+
# Add columns if they exist.
72+
if columns is not None:
73+
params.append(f"columns={columns}")
74+
75+
# Build the parameter string.
76+
params_str = ", ".join(params)
77+
78+
# Create the view with the assembled parameters.
79+
con.sql(f"""
80+
CREATE VIEW "{model_name}" AS
81+
SELECT * FROM read_csv('{model_path}', {params_str});
82+
""")
83+
5384
elif server.format == "delta":
5485
con.sql("update extensions;") # Make sure we have the latest delta extension
5586
con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")

datacontract/model/data_contract_specification.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,15 @@ class Server(pyd.BaseModel):
5858
dataset: str | None = None
5959
path: str | None = None
6060
delimiter: str | None = None
61+
header: bool | None = None
62+
escape: str | None = None
63+
all_varchar: bool | None = None
64+
allow_quoted_nulls: bool | None = None
65+
dateformat: str | None = None
66+
decimal_separator: str | None = None
67+
new_line: str | None = None
68+
timestampformat: str | None = None
69+
quote: str | None = None
6170
endpointUrl: str | None = None
6271
location: str | None = None
6372
account: str | None = None

datacontract/schemas/datacontract-1.1.0.schema.json

Lines changed: 121 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,11 +1212,48 @@
12121212
},
12131213
"delimiter": {
12141214
"type": "string",
1215-
"enum": [
1216-
"new_line",
1217-
"array"
1215+
"anyOf": [
1216+
{ "enum": ["new_line", "array"] },
1217+
{ "pattern": "^.$" }
12181218
],
1219-
"description": "Only for format = json. How multiple json documents are delimited within one file"
1219+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
1220+
},
1221+
"header": {
1222+
"type": "boolean",
1223+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
1224+
},
1225+
"escape": {
1226+
"type": "string",
1227+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
1228+
},
1229+
"all_varchar": {
1230+
"type": "boolean",
1231+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
1232+
},
1233+
"allow_quoted_nulls": {
1234+
"type": "boolean",
1235+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
1236+
},
1237+
"dateformat": {
1238+
"type": "string",
1239+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
1240+
},
1241+
"decimal_separator": {
1242+
"type": "string",
1243+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
1244+
},
1245+
"new_line": {
1246+
"type": "string",
1247+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
1248+
},
1249+
"timestampformat": {
1250+
"type": "string",
1251+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
1252+
},
1253+
"quote": {
1254+
"type": "string",
1255+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
1256+
}
12201257
}
12211258
},
12221259
"required": [
@@ -1248,11 +1285,47 @@
12481285
},
12491286
"delimiter": {
12501287
"type": "string",
1251-
"enum": [
1252-
"new_line",
1253-
"array"
1288+
"anyOf": [
1289+
{ "enum": ["new_line", "array"] },
1290+
{ "pattern": "^.$" }
12541291
],
1255-
"description": "Only for format = json. How multiple json documents are delimited within one file"
1292+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
1293+
},
1294+
"header": {
1295+
"type": "boolean",
1296+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
1297+
},
1298+
"escape": {
1299+
"type": "string",
1300+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
1301+
},
1302+
"all_varchar": {
1303+
"type": "boolean",
1304+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
1305+
},
1306+
"allow_quoted_nulls": {
1307+
"type": "boolean",
1308+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
1309+
},
1310+
"dateformat": {
1311+
"type": "string",
1312+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
1313+
},
1314+
"decimal_separator": {
1315+
"type": "string",
1316+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
1317+
},
1318+
"new_line": {
1319+
"type": "string",
1320+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
1321+
},
1322+
"timestampformat": {
1323+
"type": "string",
1324+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
1325+
},
1326+
"quote": {
1327+
"type": "string",
1328+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
12561329
}
12571330
},
12581331
"required": [
@@ -1336,11 +1409,47 @@
13361409
},
13371410
"delimiter": {
13381411
"type": "string",
1339-
"enum": [
1340-
"new_line",
1341-
"array"
1412+
"anyOf": [
1413+
{ "enum": ["new_line", "array"] },
1414+
{ "pattern": "^.$" }
13421415
],
1343-
"description": "Only for format = json. How multiple json documents are delimited within one file"
1416+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
1417+
},
1418+
"header": {
1419+
"type": "boolean",
1420+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
1421+
},
1422+
"escape": {
1423+
"type": "string",
1424+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
1425+
},
1426+
"all_varchar": {
1427+
"type": "boolean",
1428+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
1429+
},
1430+
"allow_quoted_nulls": {
1431+
"type": "boolean",
1432+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
1433+
},
1434+
"dateformat": {
1435+
"type": "string",
1436+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
1437+
},
1438+
"decimal_separator": {
1439+
"type": "string",
1440+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
1441+
},
1442+
"new_line": {
1443+
"type": "string",
1444+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
1445+
},
1446+
"timestampformat": {
1447+
"type": "string",
1448+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
1449+
},
1450+
"quote": {
1451+
"type": "string",
1452+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
13441453
}
13451454
},
13461455
"required": [

datacontract/schemas/odcs-3.0.1.schema.json

Lines changed: 120 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -776,11 +776,47 @@
776776
},
777777
"delimiter": {
778778
"type": "string",
779-
"enum": [
780-
"new_line",
781-
"array"
779+
"anyOf": [
780+
{ "enum": ["new_line", "array"] },
781+
{ "pattern": "^.$" }
782782
],
783-
"description": "Only for format = json. How multiple json documents are delimited within one file"
783+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
784+
},
785+
"header": {
786+
"type": "boolean",
787+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
788+
},
789+
"escape": {
790+
"type": "string",
791+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
792+
},
793+
"all_varchar": {
794+
"type": "boolean",
795+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
796+
},
797+
"allow_quoted_nulls": {
798+
"type": "boolean",
799+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
800+
},
801+
"dateformat": {
802+
"type": "string",
803+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
804+
},
805+
"decimal_separator": {
806+
"type": "string",
807+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
808+
},
809+
"new_line": {
810+
"type": "string",
811+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
812+
},
813+
"timestampformat": {
814+
"type": "string",
815+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
816+
},
817+
"quote": {
818+
"type": "string",
819+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
784820
}
785821
},
786822
"required": [
@@ -1381,11 +1417,47 @@
13811417
},
13821418
"delimiter": {
13831419
"type": "string",
1384-
"enum": [
1385-
"new_line",
1386-
"array"
1420+
"anyOf": [
1421+
{ "enum": ["new_line", "array"] },
1422+
{ "pattern": "^.$" }
13871423
],
1388-
"description": "Only for format = json. How multiple json documents are delimited within one file"
1424+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
1425+
},
1426+
"header": {
1427+
"type": "boolean",
1428+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
1429+
},
1430+
"escape": {
1431+
"type": "string",
1432+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
1433+
},
1434+
"all_varchar": {
1435+
"type": "boolean",
1436+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
1437+
},
1438+
"allow_quoted_nulls": {
1439+
"type": "boolean",
1440+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
1441+
},
1442+
"dateformat": {
1443+
"type": "string",
1444+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
1445+
},
1446+
"decimal_separator": {
1447+
"type": "string",
1448+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
1449+
},
1450+
"new_line": {
1451+
"type": "string",
1452+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
1453+
},
1454+
"timestampformat": {
1455+
"type": "string",
1456+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
1457+
},
1458+
"quote": {
1459+
"type": "string",
1460+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
13891461
}
13901462
},
13911463
"required": [
@@ -1417,11 +1489,47 @@
14171489
},
14181490
"delimiter": {
14191491
"type": "string",
1420-
"enum": [
1421-
"new_line",
1422-
"array"
1492+
"anyOf": [
1493+
{ "enum": ["new_line", "array"] },
1494+
{ "pattern": "^.$" }
14231495
],
1424-
"description": "Only for format = json. How multiple json documents are delimited within one file"
1496+
"description": "For JSON format, only 'new_line' or 'array' is allowed, indicating how multiple JSON documents are delimited within one file. For CSV format, any single character can be used as the delimiter between columns."
1497+
},
1498+
"header": {
1499+
"type": "boolean",
1500+
"description": "Indicates whether the first row in the CSV file should be treated as column headers. Only valid for CSV."
1501+
},
1502+
"escape": {
1503+
"type": "string",
1504+
"description": "Specifies the escape character used in the CSV file to include special characters in fields. Only valid for CSV."
1505+
},
1506+
"all_varchar": {
1507+
"type": "boolean",
1508+
"description": "If true, all CSV columns are read as VARCHAR (strings), bypassing type inference. Only valid for CSV."
1509+
},
1510+
"allow_quoted_nulls": {
1511+
"type": "boolean",
1512+
"description": "If true, quoted 'NULL' values in the CSV are interpreted as SQL NULL rather than as the string 'NULL'. Only valid for CSV."
1513+
},
1514+
"dateformat": {
1515+
"type": "string",
1516+
"description": "A format string (e.g., '%Y-%m-%d') used to parse date values in the CSV. Only valid for CSV."
1517+
},
1518+
"decimal_separator": {
1519+
"type": "string",
1520+
"description": "The character used as the decimal separator in numeric CSV values (e.g., '.' or ','). Only valid for CSV."
1521+
},
1522+
"new_line": {
1523+
"type": "string",
1524+
"description": "The newline character(s) used in the CSV file (e.g., '\\n' or '\\r\\n'). Only valid for CSV."
1525+
},
1526+
"timestampformat": {
1527+
"type": "string",
1528+
"description": "A format string (e.g., '%Y-%m-%d %H:%M:%S') used to parse timestamp values in the CSV. Only valid for CSV."
1529+
},
1530+
"quote": {
1531+
"type": "string",
1532+
"description": "The character used for quoting fields in the CSV file (e.g., '\"'). Only valid for CSV."
14251533
}
14261534
},
14271535
"required": [

0 commit comments

Comments
 (0)