From 8c1696867263e36b2823655425ece805f2ac42d4 Mon Sep 17 00:00:00 2001 From: Mateo Date: Thu, 21 May 2026 18:35:32 +0200 Subject: [PATCH 1/4] Refactor validation logic and update JSON schema for improved error handling --- src/vtlengine/API/_InternalApi.py | 176 ++++++++---------- .../API/data/schema/json_schema_2.1.json | 171 +++++++++++------ tests/API/test_api.py | 12 +- 3 files changed, 198 insertions(+), 161 deletions(-) diff --git a/src/vtlengine/API/_InternalApi.py b/src/vtlengine/API/_InternalApi.py index 042ceeffc..c791875e3 100644 --- a/src/vtlengine/API/_InternalApi.py +++ b/src/vtlengine/API/_InternalApi.py @@ -24,7 +24,6 @@ from vtlengine.Exceptions import ( DataLoadError, InputValidationException, - check_key, ) from vtlengine.files.parser import ( _fill_dataset_empty_data, @@ -43,14 +42,10 @@ Dataset, ExternalRoutine, Role, - Role_keys, Scalar, ValueDomain, ) -# Cache SCALAR_TYPES keys for performance -_SCALAR_TYPE_KEYS = SCALAR_TYPES.keys() - base_path = Path(__file__).parent schema_path = base_path / "data" / "schema" sdmx_csv_path = base_path / "data" / "sdmx_csv" @@ -62,105 +57,70 @@ external_routine_schema = json.load(file) -def _extract_data_type(component: Dict[str, Any]) -> Tuple[str, Any]: - """ - Extract and validate data type from component dictionary. +def _get_scalar_type(component: Dict[str, Any]) -> type: + """Return the data type string from a component dict (accepts 'type' or 'data_type').""" + scalar_type = component.get("type") or component["data_type"] + return SCALAR_TYPES[scalar_type] - Supports both 'type' (preferred) and 'data_type' (backward compatibility) keys. - Args: - component: Component dictionary with either 'type' or 'data_type' key +def _build_component(component: Dict[str, Any]) -> VTL_Component: + role = Role("Attribute" if component["role"] == "ViralAttribute" else component["role"]) + nullable = component.get("nullable", role != Role.IDENTIFIER) + return VTL_Component( + name=component["name"], + data_type=_get_scalar_type(component), + role=role, + nullable=nullable, + ) - Returns: - Tuple of (data_type_key, scalar_type_class) - Raises: - InputValidationException: If the data type key or value is invalid - """ - if "type" in component: - key = "type" - value = component["type"] - else: - key = "data_type" - value = component["data_type"] - - check_key(key, _SCALAR_TYPE_KEYS, value) - return key, SCALAR_TYPES[value] +def _resolve_components( + dataset_json: Dict[str, Any], + structures: Dict[str, Any], +) -> List[Dict[str, Any]]: + """Return the inline DataStructure list or resolve it via the dataset's 'structure' ref.""" + if "DataStructure" in dataset_json: + return cast(List[Dict[str, Any]], dataset_json["DataStructure"]) + structure_name = dataset_json["structure"] + for s in structures.get("structures", []): + if s["name"] == structure_name: + return cast(List[Dict[str, Any]], s["components"]) + raise InputValidationException( + code="0-2-1-1", + element=f"DataStructure '{dataset_json['name']}'", + error=f"Referenced structure '{structure_name}' not found", + ) def _load_dataset_from_structure( structures: Dict[str, Any], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ - Loads a dataset with the structure given. - """ - datasets = {} - scalars = {} - - if "datasets" in structures: - for dataset_json in structures["datasets"]: - dataset_name = dataset_json["name"] - components = {} - - if "structure" in dataset_json: - structure_name = dataset_json["structure"] - structure_json = None - for s in structures["structures"]: - if s["name"] == structure_name: - structure_json = s - if structure_json is None: - raise InputValidationException(code="0-2-1-2", message="Structure not found.") - try: - jsonschema.validate(instance=structure_json, schema=schema) - except jsonschema.exceptions.ValidationError as e: - raise InputValidationException(code="0-2-1-2", message=e.message) - - for component in structure_json["components"]: - # Support both 'type' and 'data_type' for backward compatibility - _, scalar_type = _extract_data_type(component) - if component["role"] == "ViralAttribute": - component["role"] = "Attribute" - - check_key("role", Role_keys, component["role"]) - - if "nullable" not in component: - if Role(component["role"]) == Role.IDENTIFIER: - component["nullable"] = False - elif Role(component["role"]) in (Role.MEASURE, Role.ATTRIBUTE): - component["nullable"] = True - else: - component["nullable"] = False - - components[component["name"]] = VTL_Component( - name=component["name"], - data_type=scalar_type, - role=Role(component["role"]), - nullable=component["nullable"], - ) - - if "DataStructure" in dataset_json: - for component in dataset_json["DataStructure"]: - # Support both 'type' and 'data_type' for backward compatibility - _, scalar_type = _extract_data_type(component) - check_key("role", Role_keys, component["role"]) - components[component["name"]] = VTL_Component( - name=component["name"], - data_type=scalar_type, - role=Role(component["role"]), - nullable=component["nullable"], - ) - - datasets[dataset_name] = Dataset(name=dataset_name, components=components, data=None) - if "scalars" in structures: - for scalar_json in structures["scalars"]: - scalar_name = scalar_json["name"] - check_key("type", SCALAR_TYPES.keys(), scalar_json["type"]) - scalar = Scalar( - name=scalar_name, - data_type=SCALAR_TYPES[scalar_json["type"]], - value=None, - ) - scalars[scalar_name] = scalar + Loads datasets and scalars from a VTL JSON structure definition. + """ + _validate_json(structures, schema, kind="DataStructures") + + datasets = { + dataset_json["name"]: Dataset( + name=dataset_json["name"], + components={ + c["name"]: _build_component(c) + for c in _resolve_components(dataset_json, structures) + }, + data=None, + ) + for dataset_json in structures.get("datasets", []) + } + + scalars = { + scalar_json["name"]: Scalar( + name=scalar_json["name"], + data_type=_get_scalar_type(scalar_json), + value=None, + ) + for scalar_json in structures.get("scalars", []) + } + return datasets, scalars @@ -598,6 +558,32 @@ def load_vtl(input: Union[str, Path]) -> str: return f.read() +_SECTION_KINDS = { + "datasets": "Dataset", + "scalars": "Scalar", + "structures": "Structure", + "variables": "Variable", + "domains": "Domain", +} + + +def _identify_failing_element( + data: Dict[str, Any], + path: List[Any], + default_kind: str, +) -> Tuple[str, Optional[str]]: + """Return (kind, name) for the failing element pointed to by a ValidationError path.""" + if len(path) < 2 or path[0] not in _SECTION_KINDS or not isinstance(path[1], int): + return default_kind, None + kind = _SECTION_KINDS[path[0]] + try: + element = data[path[0]][path[1]] + except (KeyError, IndexError, TypeError): + return kind, None + name = element.get("name") if isinstance(element, dict) else None + return kind, name + + def _validate_json( data: Dict[str, Any], schema: Dict[str, Any], @@ -607,6 +593,8 @@ def _validate_json( try: jsonschema.validate(instance=data, schema=schema) except jsonschema.ValidationError as e: + if name is None: + kind, name = _identify_failing_element(data, list(e.absolute_path), default_kind=kind) element = f"{kind} '{name}'" if name else f"the provided {kind}" raise InputValidationException(code="0-2-1-1", element=element, error=e.message) diff --git a/src/vtlengine/API/data/schema/json_schema_2.1.json b/src/vtlengine/API/data/schema/json_schema_2.1.json index 8933cf189..688190b13 100644 --- a/src/vtlengine/API/data/schema/json_schema_2.1.json +++ b/src/vtlengine/API/data/schema/json_schema_2.1.json @@ -6,21 +6,56 @@ "type": "string", "pattern": "^[a-zA-Z][a-zA-Z0-9_]*$|^'.*'$" }, - "set-type": { - "type": "array", - "uniqueItems": true, - "oneOf": [ - { "items": { "oneOf": [ { "type": "string" }, { "type": "null" } ] } }, - { "items": { "oneOf": [ { "type": "number" }, { "type": "null" } ] } } + "component-name": { + "type": "string", + "pattern": "^[a-zA-Z][a-zA-Z0-9_]*(#[a-zA-Z][a-zA-Z0-9_]*)?$|^'.*'$" + }, + "data-type": { + "type": "string", + "enum": [ + "String", + "Number", + "Integer", + "Boolean", + "Time", + "Time_Period", + "Date", + "Duration" + ] + }, + "role": { + "type": "string", + "enum": [ + "Identifier", + "Measure", + "Attribute", + "ViralAttribute" ] }, - "identifiable": { + "component": { "type": "object", "properties": { - "name": { "$ref": "#/$defs/vtl-id" }, - "description": { "type": "string" } + "name": {"$ref": "#/$defs/component-name"}, + "role": {"$ref": "#/$defs/role"}, + "nullable": {"type": "boolean"}, + "type": {"$ref": "#/$defs/data-type"}, + "data_type": {"$ref": "#/$defs/data-type"}, + "subset": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"} }, - "required": [ "name" ] + "required": ["name", "role"], + "oneOf": [ + {"required": ["type"]}, + {"required": ["data_type"]} + ] + }, + "set-type": { + "type": "array", + "uniqueItems": true, + "oneOf": [ + {"items": {"oneOf": [{"type": "string"}, {"type": "null"}]}}, + {"items": {"oneOf": [{"type": "number"}, {"type": "null"}]}} + ] } }, "type": "object", @@ -28,83 +63,98 @@ "datasets": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { - "source": { "type": "string" }, - "structure": { "$ref": "#/$defs/vtl-id" } + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "source": {"type": "string"}, + "structure": {"$ref": "#/$defs/vtl-id"}, + "DataStructure": { + "type": "array", + "items": {"$ref": "#/$defs/component"} + } }, - "required": [ "structure" ] + "required": ["name"], + "oneOf": [ + {"required": ["DataStructure"]}, + {"required": ["structure"]} + ] } }, "structures": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, "components": { "type": "array", - "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], - "properties": { - "role": { - "type": "string", - "enum": [ "Identifier", "Measure", "Attribute", "Viral Attribute" ] - }, - "subset": { "$ref": "#/$defs/vtl-id" }, - "nullable": { "type": "boolean" }, - "data_type": { - "type": "string", - "enum": [ "String", "Number", "Integer", "Boolean", "Time", "TimePeriod", "Date", "Duration" ] - } - }, - "required": [ "role" ] - } + "items": {"$ref": "#/$defs/component"} } }, - "required": [ "components" ] + "required": ["name", "components"] + } + }, + "scalars": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "type": {"$ref": "#/$defs/data-type"}, + "data_type": {"$ref": "#/$defs/data-type"} + }, + "required": ["name"], + "oneOf": [ + {"required": ["type"]}, + {"required": ["data_type"]} + ] } }, "variables": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { - "domain": { "$ref": "#/$defs/vtl-id" } + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "domain": {"$ref": "#/$defs/vtl-id"} }, - "required": [ "domain" ] + "required": ["name", "domain"] } }, "domains": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", + "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"} + }, + "required": ["name"], "unevaluatedProperties": false, "oneOf": [ { - "properties": { - "externalRef": { "type": "string" } - }, - "required": [ "externalRef" ] - }, { - "properties": { - "parent": { "$ref": "#/$defs/vtl-id" } - }, - "required": [ "parent" ], - "oneOf": [{ - "properties": { - "restriction": { "$ref": "#/$defs/set-type" } - }, - "required": [ "restriction" ] - }, { - "properties": { - "enumerated": { "$ref": "#/$defs/set-type" } - }, - "required": [ "enumerated" ] - }, { - "properties": { - "described": { "type": "string" } - }, - "required": [ "described" ] + "properties": {"externalRef": {"type": "string"}}, + "required": ["externalRef"] + }, + { + "properties": {"parent": {"$ref": "#/$defs/vtl-id"}}, + "required": ["parent"], + "oneOf": [ + { + "properties": {"restriction": {"$ref": "#/$defs/set-type"}}, + "required": ["restriction"] + }, + { + "properties": {"enumerated": {"$ref": "#/$defs/set-type"}}, + "required": ["enumerated"] + }, + { + "properties": {"described": {"type": "string"}}, + "required": ["described"] } ] } @@ -113,4 +163,3 @@ } } } - diff --git a/tests/API/test_api.py b/tests/API/test_api.py index bd52efc9b..3609579af 100644 --- a/tests/API/test_api.py +++ b/tests/API/test_api.py @@ -344,11 +344,11 @@ param_id_null = [((filepath_json / "DS_ID_null.json"), "Identifier Id_1 cannot be nullable")] -param_wrong_role = [((filepath_json / "DS_Role_wrong.json"), "0-1-1-13")] +param_wrong_role = [((filepath_json / "DS_Role_wrong.json"), "0-2-1-1")] -param_wrong_data_type = [((filepath_json / "DS_wrong_datatype.json"), "0-1-1-13")] +param_wrong_data_type = [((filepath_json / "DS_wrong_datatype.json"), "0-2-1-1")] -param_viral_attr = [((filepath_json / "DS_Viral_attr.json"), "0-1-1-13")] +param_viral_attr = [((filepath_json / "DS_Viral_attr.json"), "0-2-1-1")] params_check_script = [ ( @@ -1533,13 +1533,13 @@ def test_load_data_structure_with_null_id(ds_r, error_message): @pytest.mark.parametrize("ds_r, error_code", param_wrong_role) def test_load_data_structure_with_wrong_role(ds_r, error_code): - with pytest.raises(SemanticError, match=error_code): + with pytest.raises(InputValidationException, match=error_code): load_datasets(ds_r) @pytest.mark.parametrize("ds_r, error_code", param_wrong_data_type) def test_load_data_structure_with_wrong_data_type(ds_r, error_code): - with pytest.raises(SemanticError, match=error_code): + with pytest.raises(InputValidationException, match=error_code): load_datasets(ds_r) @@ -1768,7 +1768,7 @@ def test_wrong_type_in_scalar_definition(wrong_type, correct_type): ] } - with pytest.raises(SemanticError, match="0-1-1-13") as e: + with pytest.raises(InputValidationException, match="0-2-1-1") as e: run( script=script, data_structures=data_structures, From 989169d2c55c9b8e13a7dbd4adcdd5ed9977eee5 Mon Sep 17 00:00:00 2001 From: Mateo Date: Fri, 22 May 2026 10:30:26 +0200 Subject: [PATCH 2/4] Minor fix --- src/vtlengine/API/_InternalApi.py | 86 ++++++++++++++++++------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/src/vtlengine/API/_InternalApi.py b/src/vtlengine/API/_InternalApi.py index c791875e3..28562c450 100644 --- a/src/vtlengine/API/_InternalApi.py +++ b/src/vtlengine/API/_InternalApi.py @@ -24,6 +24,7 @@ from vtlengine.Exceptions import ( DataLoadError, InputValidationException, + check_key, ) from vtlengine.files.parser import ( _fill_dataset_empty_data, @@ -46,6 +47,9 @@ ValueDomain, ) +# Cache SCALAR_TYPES keys for performance +_SCALAR_TYPE_KEYS = SCALAR_TYPES.keys() + base_path = Path(__file__).parent schema_path = base_path / "data" / "schema" sdmx_csv_path = base_path / "data" / "sdmx_csv" @@ -57,18 +61,34 @@ external_routine_schema = json.load(file) -def _get_scalar_type(component: Dict[str, Any]) -> type: - """Return the data type string from a component dict (accepts 'type' or 'data_type').""" - scalar_type = component.get("type") or component["data_type"] - return SCALAR_TYPES[scalar_type] +def _extract_data_type(component: Dict[str, Any]) -> Tuple[str, Any]: + """ + Extract and validate data type from component dictionary. + + Supports both 'type' (preferred) and 'data_type' (backward compatibility) keys. + + Args: + component: Component dictionary with either 'type' or 'data_type' key + + Returns: + Tuple of (data_type_key, scalar_type_class) + + Raises: + InputValidationException: If the data type key or value is invalid + """ + key = "type" if "type" in component else "data_type" + value = component[key] + check_key(key, _SCALAR_TYPE_KEYS, value) + return key, SCALAR_TYPES[value] def _build_component(component: Dict[str, Any]) -> VTL_Component: role = Role("Attribute" if component["role"] == "ViralAttribute" else component["role"]) nullable = component.get("nullable", role != Role.IDENTIFIER) + _, scalar_type = _extract_data_type(component) return VTL_Component( name=component["name"], - data_type=_get_scalar_type(component), + data_type=scalar_type, role=role, nullable=nullable, ) @@ -92,13 +112,37 @@ def _resolve_components( ) +_DATASTRUCTURE_SECTION_KINDS = { + "datasets": "Dataset", + "scalars": "Scalar", + "structures": "Structure", +} + + +def _validate_datastructures(structures: Dict[str, Any]) -> None: + """Validate against the DataStructure schema, pointing to the failing dataset/scalar.""" + try: + jsonschema.validate(instance=structures, schema=schema) + except jsonschema.ValidationError as e: + path = list(e.absolute_path) + kind: str = "DataStructures" + name: Optional[str] = None + if len(path) >= 2 and path[0] in _DATASTRUCTURE_SECTION_KINDS and isinstance(path[1], int): + kind = _DATASTRUCTURE_SECTION_KINDS[path[0]] + element = structures[path[0]][path[1]] + if isinstance(element, dict): + name = element.get("name") + identifier = f"{kind} '{name}'" if name else f"the provided {kind}" + raise InputValidationException(code="0-2-1-1", element=identifier, error=e.message) + + def _load_dataset_from_structure( structures: Dict[str, Any], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ Loads datasets and scalars from a VTL JSON structure definition. """ - _validate_json(structures, schema, kind="DataStructures") + _validate_datastructures(structures) datasets = { dataset_json["name"]: Dataset( @@ -115,7 +159,7 @@ def _load_dataset_from_structure( scalars = { scalar_json["name"]: Scalar( name=scalar_json["name"], - data_type=_get_scalar_type(scalar_json), + data_type=_extract_data_type(scalar_json)[1], value=None, ) for scalar_json in structures.get("scalars", []) @@ -558,32 +602,6 @@ def load_vtl(input: Union[str, Path]) -> str: return f.read() -_SECTION_KINDS = { - "datasets": "Dataset", - "scalars": "Scalar", - "structures": "Structure", - "variables": "Variable", - "domains": "Domain", -} - - -def _identify_failing_element( - data: Dict[str, Any], - path: List[Any], - default_kind: str, -) -> Tuple[str, Optional[str]]: - """Return (kind, name) for the failing element pointed to by a ValidationError path.""" - if len(path) < 2 or path[0] not in _SECTION_KINDS or not isinstance(path[1], int): - return default_kind, None - kind = _SECTION_KINDS[path[0]] - try: - element = data[path[0]][path[1]] - except (KeyError, IndexError, TypeError): - return kind, None - name = element.get("name") if isinstance(element, dict) else None - return kind, name - - def _validate_json( data: Dict[str, Any], schema: Dict[str, Any], @@ -593,8 +611,6 @@ def _validate_json( try: jsonschema.validate(instance=data, schema=schema) except jsonschema.ValidationError as e: - if name is None: - kind, name = _identify_failing_element(data, list(e.absolute_path), default_kind=kind) element = f"{kind} '{name}'" if name else f"the provided {kind}" raise InputValidationException(code="0-2-1-1", element=element, error=e.message) From 26ec0960e490e3ac4ceeae3ec008f53c22c9afc1 Mon Sep 17 00:00:00 2001 From: Mateo Date: Fri, 22 May 2026 10:36:25 +0200 Subject: [PATCH 3/4] Refactor validation logic: replace _validate_datastructures with _validate_json for improved error handling --- src/vtlengine/API/_InternalApi.py | 48 ++++++++++++++----------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/src/vtlengine/API/_InternalApi.py b/src/vtlengine/API/_InternalApi.py index 28562c450..3d4047a69 100644 --- a/src/vtlengine/API/_InternalApi.py +++ b/src/vtlengine/API/_InternalApi.py @@ -112,37 +112,13 @@ def _resolve_components( ) -_DATASTRUCTURE_SECTION_KINDS = { - "datasets": "Dataset", - "scalars": "Scalar", - "structures": "Structure", -} - - -def _validate_datastructures(structures: Dict[str, Any]) -> None: - """Validate against the DataStructure schema, pointing to the failing dataset/scalar.""" - try: - jsonschema.validate(instance=structures, schema=schema) - except jsonschema.ValidationError as e: - path = list(e.absolute_path) - kind: str = "DataStructures" - name: Optional[str] = None - if len(path) >= 2 and path[0] in _DATASTRUCTURE_SECTION_KINDS and isinstance(path[1], int): - kind = _DATASTRUCTURE_SECTION_KINDS[path[0]] - element = structures[path[0]][path[1]] - if isinstance(element, dict): - name = element.get("name") - identifier = f"{kind} '{name}'" if name else f"the provided {kind}" - raise InputValidationException(code="0-2-1-1", element=identifier, error=e.message) - - def _load_dataset_from_structure( structures: Dict[str, Any], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ Loads datasets and scalars from a VTL JSON structure definition. """ - _validate_datastructures(structures) + _validate_json(structures, schema, kind="DataStructures") datasets = { dataset_json["name"]: Dataset( @@ -602,6 +578,13 @@ def load_vtl(input: Union[str, Path]) -> str: return f.read() +_SECTION_KINDS = { + "datasets": "Dataset", + "scalars": "Scalar", + "structures": "Structure", +} + + def _validate_json( data: Dict[str, Any], schema: Dict[str, Any], @@ -611,8 +594,19 @@ def _validate_json( try: jsonschema.validate(instance=data, schema=schema) except jsonschema.ValidationError as e: - element = f"{kind} '{name}'" if name else f"the provided {kind}" - raise InputValidationException(code="0-2-1-1", element=element, error=e.message) + if name is None: + path = list(e.absolute_path) + if ( + len(path) >= 2 + and path[0] in _SECTION_KINDS + and isinstance(path[1], int) + ): + kind = _SECTION_KINDS[path[0]] + element = data[path[0]][path[1]] + if isinstance(element, dict): + name = element.get("name") + identifier = f"{kind} '{name}'" if name else f"the provided {kind}" + raise InputValidationException(code="0-2-1-1", element=identifier, error=e.message) def _load_single_value_domain(input: Path) -> Dict[str, ValueDomain]: From 42174d306caf128456f39343dc6c5124585d72a5 Mon Sep 17 00:00:00 2001 From: Mateo Date: Fri, 22 May 2026 10:55:25 +0200 Subject: [PATCH 4/4] Added related tests --- src/vtlengine/API/_InternalApi.py | 6 +- .../API/data/schema/json_schema_2.1.json | 1 + tests/API/test_api.py | 210 ++++++++++++++++++ 3 files changed, 212 insertions(+), 5 deletions(-) diff --git a/src/vtlengine/API/_InternalApi.py b/src/vtlengine/API/_InternalApi.py index 3d4047a69..5f751cf44 100644 --- a/src/vtlengine/API/_InternalApi.py +++ b/src/vtlengine/API/_InternalApi.py @@ -596,11 +596,7 @@ def _validate_json( except jsonschema.ValidationError as e: if name is None: path = list(e.absolute_path) - if ( - len(path) >= 2 - and path[0] in _SECTION_KINDS - and isinstance(path[1], int) - ): + if len(path) >= 2 and path[0] in _SECTION_KINDS and isinstance(path[1], int): kind = _SECTION_KINDS[path[0]] element = data[path[0]][path[1]] if isinstance(element, dict): diff --git a/src/vtlengine/API/data/schema/json_schema_2.1.json b/src/vtlengine/API/data/schema/json_schema_2.1.json index 688190b13..49befe6b0 100644 --- a/src/vtlengine/API/data/schema/json_schema_2.1.json +++ b/src/vtlengine/API/data/schema/json_schema_2.1.json @@ -59,6 +59,7 @@ } }, "type": "object", + "additionalProperties": false, "properties": { "datasets": { "type": "array", diff --git a/tests/API/test_api.py b/tests/API/test_api.py index 3609579af..19f31dc8b 100644 --- a/tests/API/test_api.py +++ b/tests/API/test_api.py @@ -2128,3 +2128,213 @@ def test_run_error_on_missing_non_nullable_column(): with pytest.raises(DataLoadError, match="0-3-1-5"): run(script=script, data_structures=data_structures, datapoints=datapoints) + + +schema_validation_cases = [ + # Unknown top-level property + ( + {"unknown_key": []}, + "Additional properties are not allowed", + ), + # Dataset missing 'name' + ( + { + "datasets": [ + {"DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}]} + ] + }, + "'name' is a required property", + ), + # Dataset has neither DataStructure nor structure + ( + {"datasets": [{"name": "DS_1"}]}, + "is not valid under any of the given schemas", + ), + # Dataset has both DataStructure and structure (oneOf violation) + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}], + "structure": "Struct_1", + } + ], + "structures": [ + { + "name": "Struct_1", + "components": [{"name": "Id_1", "data_type": "Integer", "role": "Identifier"}], + } + ], + }, + "is valid under each of", + ), + # Component missing 'role' + ( + {"datasets": [{"name": "DS_1", "DataStructure": [{"name": "Id_1", "type": "Integer"}]}]}, + "'role' is a required property", + ), + # Component missing both 'type' and 'data_type' + ( + {"datasets": [{"name": "DS_1", "DataStructure": [{"name": "Id_1", "role": "Identifier"}]}]}, + "is not valid under any of the given schemas", + ), + # Component with both 'type' and 'data_type' + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [ + { + "name": "Id_1", + "type": "Integer", + "data_type": "Integer", + "role": "Identifier", + } + ], + } + ] + }, + "is valid under each of", + ), + # Component with invalid role enum + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "BadRole"}], + } + ] + }, + "'BadRole' is not one of", + ), + # Component with invalid data type enum + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "BadType", "role": "Identifier"}], + } + ] + }, + "'BadType' is not one of", + ), + # Component with nullable as non-boolean + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [ + { + "name": "Id_1", + "type": "Integer", + "role": "Identifier", + "nullable": "yes", + } + ], + } + ] + }, + "is not of type 'boolean'", + ), + # Dataset name does not match the vtl-id pattern (starts with digit) + ( + { + "datasets": [ + { + "name": "1bad", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}], + } + ] + }, + "does not match", + ), + # Scalar missing both 'type' and 'data_type' + ( + {"scalars": [{"name": "sc_1"}]}, + "is not valid under any of the given schemas", + ), + # Scalar with invalid type enum + ( + {"scalars": [{"name": "sc_1", "type": "BadType"}]}, + "'BadType' is not one of", + ), + # Structure missing 'components' + ( + { + "datasets": [{"name": "DS_1", "structure": "Struct_1"}], + "structures": [{"name": "Struct_1"}], + }, + "'components' is a required property", + ), +] + + +@pytest.mark.parametrize("data_structures, expected_fragment", schema_validation_cases) +def test_data_structure_schema_validation(data_structures, expected_fragment): + """Schema validation rejects malformed data_structures with informative messages.""" + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert expected_fragment in excinfo.value.args[0] + + +schema_failing_element_cases = [ + # Dataset error -> message includes "Dataset ''" + ( + { + "datasets": [ + { + "name": "DS_named", + "DataStructure": [{"name": "Id_1", "type": "Integer"}], + } + ] + }, + "Dataset 'DS_named'", + ), + # Scalar error -> message includes "Scalar ''" + ( + {"scalars": [{"name": "sc_named", "type": "BadType"}]}, + "Scalar 'sc_named'", + ), + # Structure error -> message includes "Structure ''" + ( + { + "datasets": [{"name": "DS_1", "structure": "Struct_named"}], + "structures": [ + { + "name": "Struct_named", + "components": [{"name": "Id_1", "role": "BadRole", "data_type": "Integer"}], + } + ], + }, + "Structure 'Struct_named'", + ), +] + + +@pytest.mark.parametrize("data_structures, expected_fragment", schema_failing_element_cases) +def test_data_structure_schema_error_identifies_element(data_structures, expected_fragment): + """The validation error message points to the specific dataset/scalar/structure that failed.""" + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert expected_fragment in excinfo.value.args[0] + + +def test_referenced_structure_not_found(): + """A dataset that references a non-existent structure raises a clear error.""" + data_structures = { + "datasets": [{"name": "DS_1", "structure": "Missing"}], + "structures": [ + { + "name": "Other", + "components": [{"name": "Id_1", "data_type": "Integer", "role": "Identifier"}], + } + ], + } + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert "Missing" in excinfo.value.args[0]