diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 2eeb328e..843e2ccd 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -3,8 +3,9 @@ from dataclasses import dataclass, field from functools import reduce from types import TracebackType -from typing import Any, Callable, Iterator, Optional +from typing import Any, Callable, Iterator, Optional, cast +from jsonpath import findall, resolve from jsonschema import Draft7Validator, FormatChecker, ValidationError from check_datapackage.config import Config @@ -16,8 +17,10 @@ from check_datapackage.exclusion import exclude from check_datapackage.extensions import apply_extensions from check_datapackage.internals import ( + PropertyField, _filter, _flat_map, + _get_fields_at_jsonpath, _map, ) from check_datapackage.issue import Issue @@ -126,6 +129,7 @@ class for more details, especially about the default values. _set_should_fields_to_required(schema) issues = _check_object_against_json_schema(properties, schema) + issues += _check_keys(properties, issues) issues += apply_extensions(properties, config.extensions) issues = exclude(issues, config.exclusions, properties) issues = sorted(set(issues)) @@ -136,6 +140,86 @@ class for more details, especially about the default values. return issues +def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]: + """Check that primary and foreign keys exist.""" + # Primary keys + resources_with_pk = _get_fields_at_jsonpath( + "$.resources[?(length(@.schema.primaryKey) > 0 || @.schema.primaryKey == '')]", + properties, + ) + resources_with_pk = _keep_resources_with_no_issue_at_property( + resources_with_pk, issues, "schema.primaryKey" + ) + key_issues = _flat_map(resources_with_pk, _check_primary_key) + + # Foreign keys + + return key_issues + + +def _issues_at_property( + resource: PropertyField, issues: list[Issue], jsonpath: str +) -> list[Issue]: + return _filter( + issues, + lambda issue: f"{resource.jsonpath}.{jsonpath}" in issue.jsonpath, + ) + + +def _keep_resources_with_no_issue_at_property( + resources: list[PropertyField], issues: list[Issue], jsonpath: str +) -> list[PropertyField]: + """Filter out resources that have an issue at or under the given `jsonpath`.""" + return _filter( + resources, + lambda resource: not _issues_at_property(resource, issues, jsonpath), + ) + + +def _check_primary_key(resource: PropertyField) -> list[Issue]: + """Check that primary key fields exist in the resource.""" + pk_fields = resolve("/schema/primaryKey", resource.value) + pk_fields_list = _key_fields_as_str_list(pk_fields) + unknown_fields = _get_unknown_key_fields(pk_fields_list, resource.value) + + if not unknown_fields: + return [] + + return [ + Issue( + jsonpath=f"{resource.jsonpath}.schema.primaryKey", + type="primary-key", + message=( + f"No fields found in resource for primary key fields: {unknown_fields}." + ), + instance=pk_fields, + ) + ] + + +def _key_fields_as_str_list(key_fields: Any) -> list[str]: + """Returns the list representation of primary and foreign key fields. + + Key fields can be represented either as a string (containing one field name) + or a list of strings. + + The input should contain a correctly typed `key_fields` object. + """ + if not isinstance(key_fields, list): + key_fields = [key_fields] + return cast(list[str], key_fields) + + +def _get_unknown_key_fields( + key_fields: list[str], properties: dict[str, Any], resource_path: str = "" +) -> str: + """Return the key fields that don't exist on the specified resource.""" + known_fields = findall(f"{resource_path}schema.fields[*].name", properties) + unknown_fields = _filter(key_fields, lambda field: field not in known_fields) + unknown_fields = _map(unknown_fields, lambda field: f"{field!r}") + return ", ".join(unknown_fields) + + def _set_should_fields_to_required(schema: dict[str, Any]) -> dict[str, Any]: """Set 'SHOULD' fields to 'REQUIRED' in the schema.""" should_fields = ("name", "id", "licenses") diff --git a/tests/test_check.py b/tests/test_check.py index ad1a051f..14388ee3 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -91,6 +91,60 @@ def test_fails_properties_with_pattern_mismatch(): assert issues[0].jsonpath == "$.contributors[0].path" +@mark.parametrize("primary_key", ["id", ["id", "name"]]) +def test_pass_good_primary_key(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + ] + ) + + issues = check(properties) + + assert issues == [] + + +@mark.parametrize("primary_key", ["", "last_name", ["first_name", "last_name"]]) +def test_fail_primary_key_with_unknown_fields(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.primaryKey" + assert issues[0].type == "primary-key" + assert issues[0].instance == primary_key + + +@mark.parametrize("primary_key", [None, 123, [], [123, "a_field"]]) +def test_do_not_check_bad_primary_key_against_fields(primary_key): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = primary_key + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type != "primary-key" + + +def test_do_not_check_primary_key_against_bad_field(): + properties = example_package_properties() + properties["resources"][0]["schema"]["primaryKey"] = "eye-colour" + properties["resources"][0]["schema"]["fields"].append( + # Bad name + {"name": 123, "type": "integer"}, + ) + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type != "primary-key" + + # "SHOULD" checks @@ -597,16 +651,6 @@ def test_fail_foreign_keys_with_bad_array_item(): ) -@mark.parametrize("primary_key", ["id", ["name", "address"]]) -def test_pass_good_primary_key(primary_key): - properties = example_package_properties() - properties["resources"][0]["schema"]["primaryKey"] = primary_key - - issues = check(properties) - - assert issues == [] - - def test_fail_primary_key_of_bad_type(): properties = example_package_properties() properties["resources"][0]["schema"]["primaryKey"] = 123