From 0fa6c2e0783c02cd62270f48157c2f3d17d7d551 Mon Sep 17 00:00:00 2001 From: Marton Vago Date: Wed, 26 Nov 2025 15:46:39 +0000 Subject: [PATCH 1/5] feat: :sparkles: check primary key exists --- src/check_datapackage/check.py | 78 +++++++++++++++++++++++++++++++++- tests/test_check.py | 63 ++++++++++++++++++++++----- 2 files changed, 130 insertions(+), 11 deletions(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 49db8600..b0b481ab 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -3,8 +3,9 @@ from dataclasses import dataclass, field from functools import reduce from types import TracebackType -from typing import Any, Callable, Iterator, Optional +from typing import Any, Callable, Iterator, Optional, cast +from jsonpath import findall, resolve from jsonschema import Draft7Validator, FormatChecker, ValidationError from check_datapackage.config import Config @@ -16,8 +17,10 @@ from check_datapackage.exclusion import exclude from check_datapackage.extensions import apply_extensions from check_datapackage.internals import ( + PropertyField, _filter, _flat_map, + _get_fields_at_jsonpath, _map, ) from check_datapackage.issue import Issue @@ -87,6 +90,7 @@ class for more details, especially about the default values. _set_should_fields_to_required(schema) issues = _check_object_against_json_schema(properties, schema) + issues += _check_keys(properties, _map(issues, lambda issue: issue.jsonpath)) issues += apply_extensions(properties, config.extensions) issues = exclude(issues, config.exclusions, properties) issues = sorted(set(issues)) @@ -97,6 +101,78 @@ class for more details, especially about the default values. return issues +def _check_keys(properties: dict[str, Any], issue_jsonpaths: list[str]) -> list[Issue]: + """Check that primary and foreign keys exist.""" + # Primary keys + resources_with_pk = _get_fields_at_jsonpath( + "$.resources[?(length(@.schema.primaryKey) > 0 || @.schema.primaryKey == '')]", + properties, + ) + resources_with_pk = _only_resources_with_correct_property_at( + "schema.primaryKey", resources_with_pk, issue_jsonpaths + ) + issues = _flat_map(resources_with_pk, _check_primary_key) + + # Foreign keys + + return issues + + +def _only_resources_with_correct_property_at( + jsonpath: str, resources: list[PropertyField], issue_jsonpaths: list[str] +) -> list[PropertyField]: + """Filter out resources that have an error at or under the given `jsonpath`.""" + return _filter( + resources, + lambda resource: not _filter( + issue_jsonpaths, + lambda path: f"{resource.jsonpath}.{jsonpath}" in path, + ), + ) + + +def _check_primary_key(resource: PropertyField) -> list[Issue]: + """Check that primary key fields exist in the resource.""" + pk_fields = _key_fields_as_str_list(resolve("/schema/primaryKey", resource.value)) + unknown_fields = _get_unknown_key_fields(pk_fields, resource.value) + + if not unknown_fields: + return [] + + return [ + Issue( + jsonpath=f"{resource.jsonpath}.schema.primaryKey", + type="primary-key", + message=( + f"No fields found in resource for primary key fields: {unknown_fields}." + ), + ) + ] + + +def _key_fields_as_str_list(key_fields: Any) -> list[str]: + """Returns the list representation of primary and foreign key fields. + + Key fields can be represented either as a string (containing one field name) + or a list of strings. + + The input should contain a correctly typed `key_fields` object. + """ + if not isinstance(key_fields, list): + key_fields = [key_fields] + return cast(list[str], key_fields) + + +def _get_unknown_key_fields( + key_fields: list[str], properties: dict[str, Any], resource_path: str = "" +) -> str: + """Return the key fields that don't exist on the specified resource.""" + known_fields = findall(f"{resource_path}schema.fields[*].name", properties) + unknown_fields = _filter(key_fields, lambda field: field not in known_fields) + unknown_fields = _map(unknown_fields, lambda field: f"'{field}'") + return ", ".join(unknown_fields) + + def _set_should_fields_to_required(schema: dict[str, Any]) -> dict[str, Any]: """Set 'SHOULD' fields to 'REQUIRED' in the schema.""" should_fields = ("name", "id", "licenses") diff --git a/tests/test_check.py b/tests/test_check.py index 1038d91e..68e6e450 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -91,6 +91,59 @@ def test_fails_properties_with_pattern_mismatch(): assert issues[0].jsonpath == "$.contributors[0].path" +@mark.parametrize("primary_key", ["id", ["id", "name"]]) +def test_pass_good_primary_key(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + ] + ) + + issues = check(properties) + + assert issues == [] + + +@mark.parametrize("primary_key", ["", "last_name", ["first_name", "last_name"]]) +def test_fail_primary_key_with_unknown_fields(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.primaryKey" + assert issues[0].type == "primary-key" + + +@mark.parametrize("primary_key", [None, 123, [], [123, "a_field"]]) +def test_do_not_check_bad_primary_key_against_fields(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type != "primary-key" + + +def test_do_not_check_primary_key_against_bad_field(): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = "eye-colour" + properties["resources"][0]["schema"]["fields"].append( + # Bad name + {"name": 123, "type": "integer"}, + ) + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type != "primary-key" + + # "SHOULD" checks @@ -586,16 +639,6 @@ def test_fail_foreign_keys_with_bad_array_item(): ) -@mark.parametrize("primary_key", ["id", ["name", "address"]]) -def test_pass_good_primary_key(primary_key): - properties = example_package_properties() - properties["resources"][0]["schema"]["primaryKey"] = primary_key - - issues = check(properties) - - assert issues == [] - - def test_fail_primary_key_of_bad_type(): properties = example_package_properties() properties["resources"][0]["schema"]["primaryKey"] = 123 From f03c11d6d377c786314037b49482cb1f5921749c Mon Sep 17 00:00:00 2001 From: Marton Vago Date: Thu, 27 Nov 2025 14:32:45 +0000 Subject: [PATCH 2/5] refactor: :recycle: use repr instead of str --- src/check_datapackage/check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index ea627ecd..9fc26d93 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -208,7 +208,7 @@ def _get_unknown_key_fields( """Return the key fields that don't exist on the specified resource.""" known_fields = findall(f"{resource_path}schema.fields[*].name", properties) unknown_fields = _filter(key_fields, lambda field: field not in known_fields) - unknown_fields = _map(unknown_fields, lambda field: f"'{field}'") + unknown_fields = _map(unknown_fields, lambda field: f"{field!r}") return ", ".join(unknown_fields) From 45ef20ae57de64ff7179e1441a6face90ad8c775 Mon Sep 17 00:00:00 2001 From: Marton Vago Date: Mon, 1 Dec 2025 10:47:12 +0000 Subject: [PATCH 3/5] fix: :bug: specify instance when creating issue --- src/check_datapackage/check.py | 6 ++++-- tests/test_check.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 9fc26d93..1abc7712 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -172,8 +172,9 @@ def _only_resources_with_correct_property_at( def _check_primary_key(resource: PropertyField) -> list[Issue]: """Check that primary key fields exist in the resource.""" - pk_fields = _key_fields_as_str_list(resolve("/schema/primaryKey", resource.value)) - unknown_fields = _get_unknown_key_fields(pk_fields, resource.value) + pk_fields = resolve("/schema/primaryKey", resource.value) + pk_fields_list = _key_fields_as_str_list(pk_fields) + unknown_fields = _get_unknown_key_fields(pk_fields_list, resource.value) if not unknown_fields: return [] @@ -185,6 +186,7 @@ def _check_primary_key(resource: PropertyField) -> list[Issue]: message=( f"No fields found in resource for primary key fields: {unknown_fields}." ), + instance=pk_fields, ) ] diff --git a/tests/test_check.py b/tests/test_check.py index 26bb218c..14388ee3 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -117,6 +117,7 @@ def test_fail_primary_key_with_unknown_fields(primary_key): assert len(issues) == 1 assert issues[0].jsonpath == "$.resources[0].schema.primaryKey" assert issues[0].type == "primary-key" + assert issues[0].instance == primary_key @mark.parametrize("primary_key", [None, 123, [], [123, "a_field"]]) From c9f1d0547ecbd6994b214e3b451d9cbee970145e Mon Sep 17 00:00:00 2001 From: Marton Vago Date: Fri, 5 Dec 2025 09:22:51 +0000 Subject: [PATCH 4/5] refactor: :recycle: refactor _keep_resources_with_no_issue_at_property --- src/check_datapackage/check.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 1abc7712..a6348969 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -129,7 +129,7 @@ class for more details, especially about the default values. _set_should_fields_to_required(schema) issues = _check_object_against_json_schema(properties, schema) - issues += _check_keys(properties, _map(issues, lambda issue: issue.jsonpath)) + issues += _check_keys(properties, issues) issues += apply_extensions(properties, config.extensions) issues = exclude(issues, config.exclusions, properties) issues = sorted(set(issues)) @@ -140,15 +140,15 @@ class for more details, especially about the default values. return issues -def _check_keys(properties: dict[str, Any], issue_jsonpaths: list[str]) -> list[Issue]: +def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]: """Check that primary and foreign keys exist.""" # Primary keys resources_with_pk = _get_fields_at_jsonpath( "$.resources[?(length(@.schema.primaryKey) > 0 || @.schema.primaryKey == '')]", properties, ) - resources_with_pk = _only_resources_with_correct_property_at( - "schema.primaryKey", resources_with_pk, issue_jsonpaths + resources_with_pk = _keep_resources_with_no_issue_at_property( + resources_with_pk, issues, "schema.primaryKey" ) issues = _flat_map(resources_with_pk, _check_primary_key) @@ -157,16 +157,22 @@ def _check_keys(properties: dict[str, Any], issue_jsonpaths: list[str]) -> list[ return issues -def _only_resources_with_correct_property_at( - jsonpath: str, resources: list[PropertyField], issue_jsonpaths: list[str] +def _issues_at_property( + resource: PropertyField, issues: list[Issue], jsonpath: str +) -> list[Issue]: + return _filter( + issues, + lambda issue: f"{resource.jsonpath}.{jsonpath}" in issue.jsonpath, + ) + + +def _keep_resources_with_no_issue_at_property( + resources: list[PropertyField], issues: list[Issue], jsonpath: str ) -> list[PropertyField]: - """Filter out resources that have an error at or under the given `jsonpath`.""" + """Filter out resources that have an issue at or under the given `jsonpath`.""" return _filter( resources, - lambda resource: not _filter( - issue_jsonpaths, - lambda path: f"{resource.jsonpath}.{jsonpath}" in path, - ), + lambda resource: not _issues_at_property(resource, issues, jsonpath), ) From 3f229c057b7af276a56eb817e1158ca0eb784a1d Mon Sep 17 00:00:00 2001 From: Marton Vago Date: Fri, 5 Dec 2025 10:28:41 +0000 Subject: [PATCH 5/5] fix: :bug: fix clashing var names --- src/check_datapackage/check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index a6348969..843e2ccd 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -150,11 +150,11 @@ def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]: resources_with_pk = _keep_resources_with_no_issue_at_property( resources_with_pk, issues, "schema.primaryKey" ) - issues = _flat_map(resources_with_pk, _check_primary_key) + key_issues = _flat_map(resources_with_pk, _check_primary_key) # Foreign keys - return issues + return key_issues def _issues_at_property(