-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig-example.toml
More file actions
235 lines (206 loc) · 6.98 KB
/
config-example.toml
File metadata and controls
235 lines (206 loc) · 6.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
######################################################################################
# Global settings shared by all datatk tools below.
[datatk]
logging_level = "summary" # "verbose", "errors_only", or "summary"
######################################################################################
# object_compare: diff database objects in `schema` across the environments below.
[object_compare]
schema = "report"
database = "ExampleDb" # Optional, switches from whatever db is in conn to this db
db_type = "mssql" # Optional, defaults to mssql, accepts mssql, postgres, or databricks
# Object categories included in the comparison; remove entries to narrow scope.
# Some types are engine-specific (see per-line notes).
object_types = [
"stored_proc",
"view",
"function",
"table",
"trigger",
"sequence",
"index",
"type",
"extension", # PostgreSQL only
"external_table", # MSSQL only
"foreign_key",
]
# Display label -> connection name (same names used as `conn` elsewhere in this file).
[object_compare.environments]
DEV = "DEV_DB"
QA = "QA_DB"
TEST = "TEST_DB"
STAGE = "STAGE_DB"
PROD = "PROD_DB"
######################################################################################
# proc_tester: connection/schema plus per-type default parameter values.
[proc_tester]
schema = "report"
conn = "DEV_DB"
database = "ExampleDb" # Optional, switches from whatever db is in conn to this db
# Default values substituted per parameter type — presumably matched by SQL type; verify.
# NOTE(review): date defaults use 2024 but datetime defaults use 2023 — looks like a
# copy-paste mismatch; confirm whether the year difference is intentional.
[proc_tester.defaults]
start_date = "2024-01-01"
end_date = "2024-01-31"
start_datetime = "2023-01-01 00:00:00"
end_datetime = "2023-01-31 23:59:59"
integer = 100
bit = 1
decimal = 0.0
varchar = "12345"
######################################################################################
# view_tester: connection/schema used when testing views.
[view_tester]
schema = "report"
conn = "DEV_DB"
database = "ExampleDb" # Optional, switches from whatever db is in conn to this db
######################################################################################
# export_parquet: dump SQL objects to .parquet files under data_dir.
[export_parquet]
conn = "DEV_DB"
database = "ExampleDb" # Optional, switches from whatever db is in conn to this db
data_dir = "./data/"
batch_size = 10000 # presumably rows fetched per batch — confirm units
# List of objects to export
# Each object has:
# - name: Friendly name for the output file (without .parquet extension)
# - object: SQL object name in format "schema.object" or just "object" (defaults to dbo)
# - filter: Optional SQL WHERE clause to filter data
[[export_parquet.objects]]
name = "customers"
object = "dbo.Customers"
filter = "IsActive = 1"
[[export_parquet.objects]]
name = "orders"
object = "sales.vw_OrderDetails"
filter = "OrderDate >= '2023-01-01'"
# Example with no filter: exports the whole object.
[[export_parquet.objects]]
name = "products"
object = "inventory.Products"
# Example with default dbo schema (will use dbo.Employees)
[[export_parquet.objects]]
name = "employees"
object = "Employees"
filter = "Department = 'Engineering'"
######################################################################################
# Schema Size
# Analyze multiple environments to show schema-level aggregates
[schema_size]
mode = "summary" # "summary" or "detail"
# NOTE(review): unlike [object_compare], "databricks" is not listed here — confirm unsupported.
db_type = "mssql" # "mssql" or "postgres"
# Detail Mode options (ignored in Summary Mode)
sort_by = "data_size" # data_size, row_count, index_size, total_size
top_n = 50
# One [[schema_size.environments]] entry per environment to aggregate.
[[schema_size.environments]]
name = "DEV"
conn = "DEV_DB"
databases = ["DevDB1", "DevDB2", "DevDB3"]
[[schema_size.environments]]
name = "QA"
conn = "QA_DB"
databases = ["QaDB1", "QaDB2"]
[[schema_size.environments]]
name = "TEST"
conn = "TEST_DB"
databases = ["TestDB1", "TestDB2"]
[[schema_size.environments]]
name = "PROD"
conn = "PROD_DB"
databases = ["ProdDB1", "ProdDB2", "ProdDB3"]
# Detail Mode Example
# Deep dive into specific schema, shows table-level breakdown
# [schema_size]
# mode = "detail"
# db_type = "mssql"
# sort_by = "data_size"
# top_n = 50
# [[schema_size.environments]]
# name = "DEV Analysis"
# conn = "DEV_DB"
# databases = ["DevDB1"]
# schemas = ["dbo"]
######################################################################################
# data_compare: row-level diff of two result sets (DB-to-DB, cross-engine, or DB-to-file).
[data_compare]
output_type = "right_only" # "left_only", "right_only", "common", "differences", or "all"
output_file_path = "./output/"
output_format = "csv" # "csv" or "json" or "sql"
timestamp_file = false # Whether to include timestamp in filename
# presumably caps values per IN (...) clause when output_format = "sql" — confirm
max_sql_in_values = 10000
case_insensitive = false # ignore string case when comparing; can be overridden per comparison
show_performance = true # Show query execution progress and timing
# Each [[data_compare.compare_list]] entry is one left-vs-right comparison.
[[data_compare.compare_list]]
name = "Example Comparison"
left_connection = "DEV_DB"
right_connection = "QA_DB"
left_db_type = "mssql"
right_db_type = "mssql"
schema_name = "dbo" # Optional schema name for SQL output format
table_name = "my_table" # Optional table name for SQL output format
# case_insensitive = true # Override global default for this comparison
left_query = """
SELECT 1 AS column_one, 2 AS column_two
"""
right_query = """
SELECT 1 AS column_one, 2 AS column_two
"""
# Example comparing MSSQL to PostgreSQL
[[data_compare.compare_list]]
name = "MSSQL to PostgreSQL"
left_connection = "DEV_DB"
right_connection = "PG_DEV_DB"
left_db_type = "mssql"
right_db_type = "postgres"
left_query = """
SELECT id, name FROM dbo.Users
"""
right_query = """
SELECT id, name FROM public.users
"""
# Example comparing MSSQL to Databricks
[[data_compare.compare_list]]
name = "MSSQL to Databricks"
left_connection = "DEV_DB"
right_connection = "DBX_DEV"
left_db_type = "mssql"
right_db_type = "databricks"
left_query = """
SELECT id, name FROM dbo.Users
"""
right_query = """
SELECT id, name FROM default.users
"""
# Example comparing a DB query to a local file (parquet, csv, or json)
[[data_compare.compare_list]]
name = "DB vs File"
left_connection = "DEV_DB"
left_db_type = "mssql"
right_db_type = "file" # right_query becomes the file path; no right_connection needed
left_query = """
SELECT id, name FROM dbo.Users
"""
right_query = "./data/users.parquet"
######################################################################################
# data_cleanup: delete the rows returned by query_of_data_to_remove from one table,
# in batches, with an optional dry-run ("summary") mode.
[data_cleanup]
conn = "DEV_DB"
database = "YourDatabaseName"
schema = "dbo"
table = "YourTableName"
cleanup_mode = "summary" # "summary" or "execute"
batch_size = 1000 # Number of records to process in each batch
batch_threshold = 3000 # Minimum records before batching is applied
name = "MyDb Deleted Data Cleanup" # optional name for cleaner file output
# Tables to temporarily disable foreign key constraints for during deletion
# Format: ["schema.table", "schema.table2"]
# This is optional, but can improve performance for tables with many FKs
# (trailing comma added to match the multi-line array style used by object_types above)
disable_foreign_keys_for_tables = [
"dbo.HighVolumeTable",
"dbo.AnotherTableWithManyFKs",
]
# SQL query that returns the primary key values to remove
query_of_data_to_remove = """
SELECT id FROM dbo.YourTableName
WHERE DeleteFlag = 1
"""
######################################################################################
# db_diagram: generate an entity-relationship diagram for one schema or one table hierarchy.
[db_diagram]
conn = "DEV_DB"
database = "" # Optional: override database from connection string
schema = "dbo"
column_mode = "all" # "all", "keys_only", or "none"
diagram_format = "dbml" # "dbml", "mermaid", or "plantuml"
output_file = "database_erd" # base filename; extension presumably added per format — confirm
output_directory = "./output/diagrams"
# Hierarchical diagram options (optional)
scope = "schema" # "schema" (entire schema) or "hierarchy" (focused on base table)
# base_table = "Invoice" # Required when scope = "hierarchy"
# hierarchy_direction = "both" # "up", "down", or "both" - direction to traverse relationships
# hierarchy_max_depth = 3 # Optional limit for large hierarchies