Skip to content

Commit 13723ea

Browse files
author
shourya_singh_cs
committed
added the test case for params row-filter
1 parent 410f17e commit 13723ea

6 files changed

Lines changed: 406 additions & 3 deletions

File tree

configs/entities/test-data.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ entities:
1616
TEST_TABLE:
1717
source_database: BIGQUERY
1818
table_name: contact_details
19-
dataset_name: <your_bigquery_dataset_id>
20-
project_name: <your_gcp_project_id>
19+
dataset_name: clouddq_dev_us_central1
20+
project_name: kthxbayes-sandbox
2121
columns:
2222
ROW_ID:
2323
name: row_id

configs/reference_columns/reference-columns.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ reference_columns:
2222

2323
INCLUDE_ALL_REFERENCE_COLUMNS:
2424
include_reference_columns:
25-
- *
25+
- "*"

tests/BUILD

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,11 @@ py_test(
246246
legacy_create_init = 0,
247247
deps = DEPS,
248248
)
249+
250+
# Test target named "test_dq_row_filter_params".
# Its sources, data files and dependencies are taken from the SRCS, DATA and
# DEPS variables, which are defined outside this hunk.
py_test(
251+
name = "test_dq_row_filter_params",
252+
srcs = SRCS,
253+
data = DATA,
254+
legacy_create_init = 0,
255+
deps = DEPS,
256+
)

tests/resources/configs/row_filters/row-filters.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,10 @@ row_filters:
2020
DATA_TYPE_EMAIL:
2121
filter_sql_expr: |-
2222
contact_type = 'email'
23+
24+
# Row filter that declares two parameters, `column` and `values`; the
# expression below refers to them as $column and $values.
# NOTE(review): presumably the parameter values are supplied by the rule
# binding and substituted before the SQL is rendered -- confirm against caller.
COLUMN_IN_VALUES_SET:
25+
params:
26+
- column
27+
- values
28+
filter_sql_expr: |-
29+
$column IN ($values)
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
-- Copyright 2022 Google LLC
2+
--
3+
-- Licensed under the Apache License, Version 2.0 (the "License");
4+
-- you may not use this file except in compliance with the License.
5+
-- You may obtain a copy of the License at
6+
--
7+
-- http://www.apache.org/licenses/LICENSE-2.0
8+
--
9+
-- Unless required by applicable law or agreed to in writing, software
10+
-- distributed under the License is distributed on an "AS IS" BASIS,
11+
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
-- See the License for the specific language governing permissions and
13+
-- limitations under the License.
14+
15+
WITH
16+
-- Single-row CTE that carries only the rule binding id.
-- validation_results selects FROM zero_record and LEFT JOINs the checked rows
-- onto it, so a result row exists even when the joined side is empty.
zero_record AS (
17+
SELECT
18+
'<rule_binding_id>' AS rule_binding_id,
19+
),
20+
-- Rows of contact_details restricted to contact_type IN ('email').
-- Every row is tagged with the rule binding id, which is the key used to join
-- against zero_record further down.
-- NOTE(review): the WHERE clause looks like the rendered form of a
-- parameterised row filter ($column IN ($values)) -- confirm.
data AS (
21+
SELECT
22+
*,
23+
'<rule_binding_id>' AS rule_binding_id,
24+
FROM
25+
`<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details` d
26+
WHERE
27+
contact_type IN ('email')
28+
),
29+
-- One row per entry of the dataset's __TABLES__ listing.
-- table_id is rebuilt as "project.dataset.table" so it can be matched against
-- the table_id literal emitted by validation_results; last_modified_time is
-- converted to a TIMESTAMP with TIMESTAMP_MILLIS.
last_mod AS (
30+
SELECT
31+
project_id || '.' || dataset_id || '.' || table_id AS table_id,
32+
TIMESTAMP_MILLIS(last_modified_time) AS last_modified
33+
FROM `<your-gcp-project-id>.<your_bigquery_dataset_id>.__TABLES__`
34+
),
35+
-- Validation rows for two rules evaluated over `data`, combined with UNION ALL.
-- Branch 1 (NO_DUPLICATES_IN_COLUMN_GROUPS): a set-level rule. The joined
-- subquery returns the rows of `data` whose (contact_type, value) pair occurs
-- more than once, and COUNT(*) OVER() attaches the number of such rows to each.
-- When the subquery returns no rows, the LEFT JOIN from zero_record leaves the
-- count NULL and the success flag is therefore NULL as well.
validation_results AS (
36+
SELECT
37+
CURRENT_TIMESTAMP() AS execution_ts,
38+
'<rule_binding_id>' AS rule_binding_id,
39+
'NO_DUPLICATES_IN_COLUMN_GROUPS' AS rule_id,
40+
'<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details' AS table_id,
41+
CAST(NULL AS STRING) AS column_id,
42+
NULL AS column_value,
43+
CAST(NULL AS STRING) AS dimension,
44+
CAST(NULL AS BOOLEAN) AS simple_rule_row_is_valid,
45+
TRUE AS skip_null_count,
46+
custom_sql_statement_validation_errors.complex_rule_validation_errors_count AS complex_rule_validation_errors_count,
47+
CASE
48+
WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count IS NULL THEN CAST(NULL AS BOOLEAN)
49+
WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count = 0 THEN TRUE
50+
ELSE FALSE
51+
END AS complex_rule_validation_success_flag,
52+
-- The raw string literal below holds a complete, separate query and is
-- returned as the failed_records_query column; its text is data, not executed
-- as part of this statement.
r"""
53+
WITH
54+
zero_record AS (
55+
SELECT
56+
'<rule_binding_id>' AS rule_binding_id,
57+
),
58+
data AS (
59+
SELECT
60+
*,
61+
'<rule_binding_id>' AS rule_binding_id,
62+
FROM
63+
`<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details` d
64+
WHERE
65+
contact_type IN ('email')
66+
),
67+
last_mod AS (
68+
SELECT
69+
project_id || '.' || dataset_id || '.' || table_id AS table_id,
70+
TIMESTAMP_MILLIS(last_modified_time) AS last_modified
71+
FROM `<your-gcp-project-id>.<your_bigquery_dataset_id>.__TABLES__`
72+
),
73+
validation_results AS (SELECT
74+
'<rule_binding_id>' AS rule_binding_id,
75+
'NO_DUPLICATES_IN_COLUMN_GROUPS' AS rule_id,
76+
'<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details' AS table_id,
77+
CAST(NULL AS STRING) AS column_id,
78+
NULL AS column_value,
79+
custom_sql_statement_validation_errors,
80+
CAST(NULL AS STRING) AS dimension,
81+
CAST(NULL AS BOOLEAN) AS simple_rule_row_is_valid,
82+
TRUE AS skip_null_count,
83+
custom_sql_statement_validation_errors.complex_rule_validation_errors_count AS complex_rule_validation_errors_count,
84+
CASE
85+
WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count IS NULL THEN CAST(NULL AS BOOLEAN)
86+
WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count = 0 THEN TRUE
87+
ELSE FALSE
88+
END AS complex_rule_validation_success_flag,
89+
FROM
90+
zero_record
91+
LEFT JOIN
92+
(
93+
SELECT
94+
*,
95+
'<rule_binding_id>' AS _rule_binding_id,
96+
COUNT(*) OVER() AS complex_rule_validation_errors_count,
97+
FROM (
98+
select a.*
99+
from data a
100+
inner join (
101+
select
102+
contact_type,value
103+
from data
104+
group by contact_type,value
105+
having count(*) > 1
106+
) duplicates
107+
using (contact_type,value)
108+
) custom_sql
109+
) custom_sql_statement_validation_errors
110+
ON
111+
zero_record.rule_binding_id = custom_sql_statement_validation_errors._rule_binding_id
112+
),
113+
all_validation_results AS (
114+
SELECT
115+
'{{ invocation_id }}' as _dq_validation_invocation_id,
116+
r.rule_binding_id AS _dq_validation_rule_binding_id,
117+
r.rule_id AS _dq_validation_rule_id,
118+
r.column_id AS _dq_validation_column_id,
119+
r.column_value AS _dq_validation_column_value,
120+
CAST(r.dimension AS STRING) AS _dq_validation_dimension,
121+
r.simple_rule_row_is_valid AS _dq_validation_simple_rule_row_is_valid,
122+
r.complex_rule_validation_errors_count AS _dq_validation_complex_rule_validation_errors_count,
123+
r.complex_rule_validation_success_flag AS _dq_validation_complex_rule_validation_success_flag,
124+
r.custom_sql_statement_validation_errors,
125+
FROM
126+
validation_results r
127+
)
128+
SELECT
129+
*
130+
FROM
131+
all_validation_results
132+
WHERE
133+
_dq_validation_simple_rule_row_is_valid is False
134+
OR
135+
_dq_validation_complex_rule_validation_success_flag is False
136+
ORDER BY _dq_validation_rule_id"""
137+
AS failed_records_query,
138+
FROM
139+
zero_record
140+
LEFT JOIN
141+
(
142+
SELECT
143+
*,
144+
'<rule_binding_id>' AS _rule_binding_id,
145+
COUNT(*) OVER() AS complex_rule_validation_errors_count,
146+
FROM (
147+
select a.*
148+
from data a
149+
inner join (
150+
select
151+
contact_type,value
152+
from data
153+
group by contact_type,value
154+
having count(*) > 1
155+
) duplicates
156+
using (contact_type,value)
157+
) custom_sql
158+
) custom_sql_statement_validation_errors
159+
ON
160+
zero_record.rule_binding_id = custom_sql_statement_validation_errors._rule_binding_id
161+
-- Branch 2 (NOT_NULL_SIMPLE): a row-level rule. zero_record is LEFT JOINed to
-- `data`, giving one result row per filtered row; simple_rule_row_is_valid is
-- TRUE when `value` IS NOT NULL and FALSE otherwise. The complex-rule columns
-- are typed NULLs so both branches expose the same column list.
UNION ALL
162+
SELECT
163+
CURRENT_TIMESTAMP() AS execution_ts,
164+
'<rule_binding_id>' AS rule_binding_id,
165+
'NOT_NULL_SIMPLE' AS rule_id,
166+
'<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details' AS table_id,
167+
'value' AS column_id,
168+
data.value AS column_value,
169+
CAST(NULL AS STRING) AS dimension,
170+
CASE
171+
WHEN value IS NOT NULL THEN TRUE
172+
ELSE
173+
FALSE
174+
END AS simple_rule_row_is_valid,
175+
TRUE AS skip_null_count,
176+
CAST(NULL AS INT64) AS complex_rule_validation_errors_count,
177+
CAST(NULL AS BOOLEAN) AS complex_rule_validation_success_flag,
178+
-- As in the first branch, the raw string literal below is a separate query
-- returned verbatim in the failed_records_query column.
r"""
179+
WITH
180+
zero_record AS (
181+
SELECT
182+
'<rule_binding_id>' AS rule_binding_id,
183+
),
184+
data AS (
185+
SELECT
186+
*,
187+
'<rule_binding_id>' AS rule_binding_id,
188+
FROM
189+
`<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details` d
190+
WHERE
191+
contact_type IN ('email')
192+
),
193+
last_mod AS (
194+
SELECT
195+
project_id || '.' || dataset_id || '.' || table_id AS table_id,
196+
TIMESTAMP_MILLIS(last_modified_time) AS last_modified
197+
FROM `<your-gcp-project-id>.<your_bigquery_dataset_id>.__TABLES__`
198+
),
199+
validation_results AS (SELECT
200+
'<rule_binding_id>' AS rule_binding_id,
201+
'NOT_NULL_SIMPLE' AS rule_id,
202+
'<your-gcp-project-id>.<your_bigquery_dataset_id>.contact_details' AS table_id,
203+
'value' AS column_id,
204+
data.value AS column_value,
205+
data.row_id AS row_id,
206+
data.contact_type AS contact_type,
207+
data.value AS value,
208+
CAST(NULL AS STRING) AS dimension,
209+
CASE
210+
WHEN value IS NOT NULL THEN TRUE
211+
ELSE
212+
FALSE
213+
END AS simple_rule_row_is_valid,
214+
TRUE AS skip_null_count,
215+
CAST(NULL AS INT64) AS complex_rule_validation_errors_count,
216+
CAST(NULL AS BOOLEAN) AS complex_rule_validation_success_flag,
217+
FROM
218+
zero_record
219+
LEFT JOIN
220+
data
221+
ON
222+
zero_record.rule_binding_id = data.rule_binding_id
223+
),
224+
all_validation_results AS (
225+
SELECT
226+
'{{ invocation_id }}' as _dq_validation_invocation_id,
227+
r.rule_binding_id AS _dq_validation_rule_binding_id,
228+
r.rule_id AS _dq_validation_rule_id,
229+
r.column_id AS _dq_validation_column_id,
230+
r.column_value AS _dq_validation_column_value,
231+
CAST(r.dimension AS STRING) AS _dq_validation_dimension,
232+
r.simple_rule_row_is_valid AS _dq_validation_simple_rule_row_is_valid,
233+
r.complex_rule_validation_errors_count AS _dq_validation_complex_rule_validation_errors_count,
234+
r.complex_rule_validation_success_flag AS _dq_validation_complex_rule_validation_success_flag,
235+
r.row_id AS row_id,
236+
r.contact_type AS contact_type,
237+
r.value AS value,
238+
FROM
239+
validation_results r
240+
)
241+
SELECT
242+
*
243+
FROM
244+
all_validation_results
245+
WHERE
246+
_dq_validation_simple_rule_row_is_valid is False
247+
OR
248+
_dq_validation_complex_rule_validation_success_flag is False
249+
ORDER BY _dq_validation_rule_id"""
250+
AS failed_records_query,
251+
FROM
252+
zero_record
253+
LEFT JOIN
254+
data
255+
ON
256+
zero_record.rule_binding_id = data.rule_binding_id
257+
),
258+
-- Final projection over validation_results.
-- Passes the per-rule columns through and adds:
--   rows_validated  - COUNT(*) over the filtered `data` CTE (same value on every row)
--   last_modified   - taken from last_mod for the matching table_id
--   metadata_json_string, configs_hashsum, dataplex_* - constant literals
--   dq_run_id       - CONCAT of rule binding id, rule id, execution_ts and True
-- The join to last_mod is a plain (inner) JOIN, so result rows whose table_id
-- has no entry in last_mod are dropped.
all_validation_results AS (
259+
SELECT
260+
r.execution_ts AS execution_ts,
261+
r.rule_binding_id AS rule_binding_id,
262+
r.rule_id AS rule_id,
263+
r.table_id AS table_id,
264+
r.column_id AS column_id,
265+
r.column_value AS column_value,
266+
CAST(r.dimension AS STRING) AS dimension,
267+
r.skip_null_count AS skip_null_count,
268+
r.simple_rule_row_is_valid AS simple_rule_row_is_valid,
269+
r.complex_rule_validation_errors_count AS complex_rule_validation_errors_count,
270+
r.complex_rule_validation_success_flag AS complex_rule_validation_success_flag,
271+
(SELECT COUNT(*) FROM data) AS rows_validated,
272+
last_mod.last_modified,
273+
'{"brand": "one"}' AS metadata_json_string,
274+
'' AS configs_hashsum,
275+
'<your_dataplex_lake_id>' AS dataplex_lake,
276+
'<your_dataplex_zone_id>' AS dataplex_zone,
277+
'<your_dataplex_asset_id>' AS dataplex_asset_id,
278+
CONCAT(r.rule_binding_id, '_', r.rule_id, '_', r.execution_ts, '_', True) AS dq_run_id,
279+
TRUE AS progress_watermark,
280+
failed_records_query AS failed_records_query,
281+
FROM
282+
validation_results r
283+
JOIN last_mod USING(table_id)
284+
)
285+
-- Statement result: every column of all_validation_results, unfiltered.
SELECT
286+
*
287+
FROM
288+
all_validation_results

0 commit comments

Comments
 (0)