diff --git a/src/vtlengine/API/_InternalApi.py b/src/vtlengine/API/_InternalApi.py index 042ceeffc..5f751cf44 100644 --- a/src/vtlengine/API/_InternalApi.py +++ b/src/vtlengine/API/_InternalApi.py @@ -43,7 +43,6 @@ Dataset, ExternalRoutine, Role, - Role_keys, Scalar, ValueDomain, ) @@ -77,90 +76,71 @@ def _extract_data_type(component: Dict[str, Any]) -> Tuple[str, Any]: Raises: InputValidationException: If the data type key or value is invalid """ - if "type" in component: - key = "type" - value = component["type"] - else: - key = "data_type" - value = component["data_type"] - + key = "type" if "type" in component else "data_type" + value = component[key] check_key(key, _SCALAR_TYPE_KEYS, value) return key, SCALAR_TYPES[value] +def _build_component(component: Dict[str, Any]) -> VTL_Component: + role = Role("Attribute" if component["role"] == "ViralAttribute" else component["role"]) + nullable = component.get("nullable", role != Role.IDENTIFIER) + _, scalar_type = _extract_data_type(component) + return VTL_Component( + name=component["name"], + data_type=scalar_type, + role=role, + nullable=nullable, + ) + + +def _resolve_components( + dataset_json: Dict[str, Any], + structures: Dict[str, Any], +) -> List[Dict[str, Any]]: + """Return the inline DataStructure list or resolve it via the dataset's 'structure' ref.""" + if "DataStructure" in dataset_json: + return cast(List[Dict[str, Any]], dataset_json["DataStructure"]) + structure_name = dataset_json["structure"] + for s in structures.get("structures", []): + if s["name"] == structure_name: + return cast(List[Dict[str, Any]], s["components"]) + raise InputValidationException( + code="0-2-1-1", + element=f"DataStructure '{dataset_json['name']}'", + error=f"Referenced structure '{structure_name}' not found", + ) + + def _load_dataset_from_structure( structures: Dict[str, Any], ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ - Loads a dataset with the structure given. 
- """ - datasets = {} - scalars = {} - - if "datasets" in structures: - for dataset_json in structures["datasets"]: - dataset_name = dataset_json["name"] - components = {} - - if "structure" in dataset_json: - structure_name = dataset_json["structure"] - structure_json = None - for s in structures["structures"]: - if s["name"] == structure_name: - structure_json = s - if structure_json is None: - raise InputValidationException(code="0-2-1-2", message="Structure not found.") - try: - jsonschema.validate(instance=structure_json, schema=schema) - except jsonschema.exceptions.ValidationError as e: - raise InputValidationException(code="0-2-1-2", message=e.message) - - for component in structure_json["components"]: - # Support both 'type' and 'data_type' for backward compatibility - _, scalar_type = _extract_data_type(component) - if component["role"] == "ViralAttribute": - component["role"] = "Attribute" - - check_key("role", Role_keys, component["role"]) - - if "nullable" not in component: - if Role(component["role"]) == Role.IDENTIFIER: - component["nullable"] = False - elif Role(component["role"]) in (Role.MEASURE, Role.ATTRIBUTE): - component["nullable"] = True - else: - component["nullable"] = False - - components[component["name"]] = VTL_Component( - name=component["name"], - data_type=scalar_type, - role=Role(component["role"]), - nullable=component["nullable"], - ) - - if "DataStructure" in dataset_json: - for component in dataset_json["DataStructure"]: - # Support both 'type' and 'data_type' for backward compatibility - _, scalar_type = _extract_data_type(component) - check_key("role", Role_keys, component["role"]) - components[component["name"]] = VTL_Component( - name=component["name"], - data_type=scalar_type, - role=Role(component["role"]), - nullable=component["nullable"], - ) - - datasets[dataset_name] = Dataset(name=dataset_name, components=components, data=None) - if "scalars" in structures: - for scalar_json in structures["scalars"]: - scalar_name = 
scalar_json["name"] - check_key("type", SCALAR_TYPES.keys(), scalar_json["type"]) - scalar = Scalar( - name=scalar_name, - data_type=SCALAR_TYPES[scalar_json["type"]], - value=None, - ) - scalars[scalar_name] = scalar + Loads datasets and scalars from a VTL JSON structure definition. + """ + _validate_json(structures, schema, kind="DataStructures") + + datasets = { + dataset_json["name"]: Dataset( + name=dataset_json["name"], + components={ + c["name"]: _build_component(c) + for c in _resolve_components(dataset_json, structures) + }, + data=None, + ) + for dataset_json in structures.get("datasets", []) + } + + scalars = { + scalar_json["name"]: Scalar( + name=scalar_json["name"], + data_type=_extract_data_type(scalar_json)[1], + value=None, + ) + for scalar_json in structures.get("scalars", []) + } + return datasets, scalars @@ -598,6 +578,13 @@ def load_vtl(input: Union[str, Path]) -> str: return f.read() +_SECTION_KINDS = { + "datasets": "Dataset", + "scalars": "Scalar", + "structures": "Structure", +} + + def _validate_json( data: Dict[str, Any], schema: Dict[str, Any], @@ -607,8 +594,15 @@ def _validate_json( try: jsonschema.validate(instance=data, schema=schema) except jsonschema.ValidationError as e: - element = f"{kind} '{name}'" if name else f"the provided {kind}" - raise InputValidationException(code="0-2-1-1", element=element, error=e.message) + if name is None: + path = list(e.absolute_path) + if len(path) >= 2 and path[0] in _SECTION_KINDS and isinstance(path[1], int): + kind = _SECTION_KINDS[path[0]] + element = data[path[0]][path[1]] + if isinstance(element, dict): + name = element.get("name") + identifier = f"{kind} '{name}'" if name else f"the provided {kind}" + raise InputValidationException(code="0-2-1-1", element=identifier, error=e.message) def _load_single_value_domain(input: Path) -> Dict[str, ValueDomain]: diff --git a/src/vtlengine/API/data/schema/json_schema_2.1.json b/src/vtlengine/API/data/schema/json_schema_2.1.json index 
8933cf189..49befe6b0 100644 --- a/src/vtlengine/API/data/schema/json_schema_2.1.json +++ b/src/vtlengine/API/data/schema/json_schema_2.1.json @@ -6,105 +6,156 @@ "type": "string", "pattern": "^[a-zA-Z][a-zA-Z0-9_]*$|^'.*'$" }, - "set-type": { - "type": "array", - "uniqueItems": true, - "oneOf": [ - { "items": { "oneOf": [ { "type": "string" }, { "type": "null" } ] } }, - { "items": { "oneOf": [ { "type": "number" }, { "type": "null" } ] } } + "component-name": { + "type": "string", + "pattern": "^[a-zA-Z][a-zA-Z0-9_]*(#[a-zA-Z][a-zA-Z0-9_]*)?$|^'.*'$" + }, + "data-type": { + "type": "string", + "enum": [ + "String", + "Number", + "Integer", + "Boolean", + "Time", + "Time_Period", + "Date", + "Duration" + ] + }, + "role": { + "type": "string", + "enum": [ + "Identifier", + "Measure", + "Attribute", + "ViralAttribute" ] }, - "identifiable": { + "component": { "type": "object", "properties": { - "name": { "$ref": "#/$defs/vtl-id" }, - "description": { "type": "string" } + "name": {"$ref": "#/$defs/component-name"}, + "role": {"$ref": "#/$defs/role"}, + "nullable": {"type": "boolean"}, + "type": {"$ref": "#/$defs/data-type"}, + "data_type": {"$ref": "#/$defs/data-type"}, + "subset": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"} }, - "required": [ "name" ] + "required": ["name", "role"], + "oneOf": [ + {"required": ["type"]}, + {"required": ["data_type"]} + ] + }, + "set-type": { + "type": "array", + "uniqueItems": true, + "oneOf": [ + {"items": {"oneOf": [{"type": "string"}, {"type": "null"}]}}, + {"items": {"oneOf": [{"type": "number"}, {"type": "null"}]}} + ] } }, "type": "object", + "additionalProperties": false, "properties": { "datasets": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { - "source": { "type": "string" }, - "structure": { "$ref": "#/$defs/vtl-id" } + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "source": {"type": "string"}, + 
"structure": {"$ref": "#/$defs/vtl-id"}, + "DataStructure": { + "type": "array", + "items": {"$ref": "#/$defs/component"} + } }, - "required": [ "structure" ] + "required": ["name"], + "oneOf": [ + {"required": ["DataStructure"]}, + {"required": ["structure"]} + ] } }, "structures": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, "components": { "type": "array", - "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], - "properties": { - "role": { - "type": "string", - "enum": [ "Identifier", "Measure", "Attribute", "Viral Attribute" ] - }, - "subset": { "$ref": "#/$defs/vtl-id" }, - "nullable": { "type": "boolean" }, - "data_type": { - "type": "string", - "enum": [ "String", "Number", "Integer", "Boolean", "Time", "TimePeriod", "Date", "Duration" ] - } - }, - "required": [ "role" ] - } + "items": {"$ref": "#/$defs/component"} } }, - "required": [ "components" ] + "required": ["name", "components"] + } + }, + "scalars": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "type": {"$ref": "#/$defs/data-type"}, + "data_type": {"$ref": "#/$defs/data-type"} + }, + "required": ["name"], + "oneOf": [ + {"required": ["type"]}, + {"required": ["data_type"]} + ] } }, "variables": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", "properties": { - "domain": { "$ref": "#/$defs/vtl-id" } + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"}, + "domain": {"$ref": "#/$defs/vtl-id"} }, - "required": [ "domain" ] + "required": ["name", "domain"] } }, "domains": { "type": "array", "items": { - "allOf": [ { "$ref": "#/$defs/identifiable" } ], + "type": "object", + "properties": { + "name": {"$ref": "#/$defs/vtl-id"}, + "description": {"type": "string"} + }, + 
"required": ["name"], "unevaluatedProperties": false, "oneOf": [ { - "properties": { - "externalRef": { "type": "string" } - }, - "required": [ "externalRef" ] - }, { - "properties": { - "parent": { "$ref": "#/$defs/vtl-id" } - }, - "required": [ "parent" ], - "oneOf": [{ - "properties": { - "restriction": { "$ref": "#/$defs/set-type" } - }, - "required": [ "restriction" ] - }, { - "properties": { - "enumerated": { "$ref": "#/$defs/set-type" } - }, - "required": [ "enumerated" ] - }, { - "properties": { - "described": { "type": "string" } - }, - "required": [ "described" ] + "properties": {"externalRef": {"type": "string"}}, + "required": ["externalRef"] + }, + { + "properties": {"parent": {"$ref": "#/$defs/vtl-id"}}, + "required": ["parent"], + "oneOf": [ + { + "properties": {"restriction": {"$ref": "#/$defs/set-type"}}, + "required": ["restriction"] + }, + { + "properties": {"enumerated": {"$ref": "#/$defs/set-type"}}, + "required": ["enumerated"] + }, + { + "properties": {"described": {"type": "string"}}, + "required": ["described"] } ] } @@ -113,4 +164,3 @@ } } } - diff --git a/tests/API/test_api.py b/tests/API/test_api.py index bd52efc9b..19f31dc8b 100644 --- a/tests/API/test_api.py +++ b/tests/API/test_api.py @@ -344,11 +344,11 @@ param_id_null = [((filepath_json / "DS_ID_null.json"), "Identifier Id_1 cannot be nullable")] -param_wrong_role = [((filepath_json / "DS_Role_wrong.json"), "0-1-1-13")] +param_wrong_role = [((filepath_json / "DS_Role_wrong.json"), "0-2-1-1")] -param_wrong_data_type = [((filepath_json / "DS_wrong_datatype.json"), "0-1-1-13")] +param_wrong_data_type = [((filepath_json / "DS_wrong_datatype.json"), "0-2-1-1")] -param_viral_attr = [((filepath_json / "DS_Viral_attr.json"), "0-1-1-13")] +param_viral_attr = [((filepath_json / "DS_Viral_attr.json"), "0-2-1-1")] params_check_script = [ ( @@ -1533,13 +1533,13 @@ def test_load_data_structure_with_null_id(ds_r, error_message): @pytest.mark.parametrize("ds_r, error_code", param_wrong_role) def 
test_load_data_structure_with_wrong_role(ds_r, error_code): - with pytest.raises(SemanticError, match=error_code): + with pytest.raises(InputValidationException, match=error_code): load_datasets(ds_r) @pytest.mark.parametrize("ds_r, error_code", param_wrong_data_type) def test_load_data_structure_with_wrong_data_type(ds_r, error_code): - with pytest.raises(SemanticError, match=error_code): + with pytest.raises(InputValidationException, match=error_code): load_datasets(ds_r) @@ -1768,7 +1768,7 @@ def test_wrong_type_in_scalar_definition(wrong_type, correct_type): ] } - with pytest.raises(SemanticError, match="0-1-1-13") as e: + with pytest.raises(InputValidationException, match="0-2-1-1") as e: run( script=script, data_structures=data_structures, @@ -2128,3 +2128,213 @@ def test_run_error_on_missing_non_nullable_column(): with pytest.raises(DataLoadError, match="0-3-1-5"): run(script=script, data_structures=data_structures, datapoints=datapoints) + + +schema_validation_cases = [ + # Unknown top-level property + ( + {"unknown_key": []}, + "Additional properties are not allowed", + ), + # Dataset missing 'name' + ( + { + "datasets": [ + {"DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}]} + ] + }, + "'name' is a required property", + ), + # Dataset has neither DataStructure nor structure + ( + {"datasets": [{"name": "DS_1"}]}, + "is not valid under any of the given schemas", + ), + # Dataset has both DataStructure and structure (oneOf violation) + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}], + "structure": "Struct_1", + } + ], + "structures": [ + { + "name": "Struct_1", + "components": [{"name": "Id_1", "data_type": "Integer", "role": "Identifier"}], + } + ], + }, + "is valid under each of", + ), + # Component missing 'role' + ( + {"datasets": [{"name": "DS_1", "DataStructure": [{"name": "Id_1", "type": "Integer"}]}]}, + "'role' is a required property", + ), + # 
Component missing both 'type' and 'data_type' + ( + {"datasets": [{"name": "DS_1", "DataStructure": [{"name": "Id_1", "role": "Identifier"}]}]}, + "is not valid under any of the given schemas", + ), + # Component with both 'type' and 'data_type' + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [ + { + "name": "Id_1", + "type": "Integer", + "data_type": "Integer", + "role": "Identifier", + } + ], + } + ] + }, + "is valid under each of", + ), + # Component with invalid role enum + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "BadRole"}], + } + ] + }, + "'BadRole' is not one of", + ), + # Component with invalid data type enum + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [{"name": "Id_1", "type": "BadType", "role": "Identifier"}], + } + ] + }, + "'BadType' is not one of", + ), + # Component with nullable as non-boolean + ( + { + "datasets": [ + { + "name": "DS_1", + "DataStructure": [ + { + "name": "Id_1", + "type": "Integer", + "role": "Identifier", + "nullable": "yes", + } + ], + } + ] + }, + "is not of type 'boolean'", + ), + # Dataset name does not match the vtl-id pattern (starts with digit) + ( + { + "datasets": [ + { + "name": "1bad", + "DataStructure": [{"name": "Id_1", "type": "Integer", "role": "Identifier"}], + } + ] + }, + "does not match", + ), + # Scalar missing both 'type' and 'data_type' + ( + {"scalars": [{"name": "sc_1"}]}, + "is not valid under any of the given schemas", + ), + # Scalar with invalid type enum + ( + {"scalars": [{"name": "sc_1", "type": "BadType"}]}, + "'BadType' is not one of", + ), + # Structure missing 'components' + ( + { + "datasets": [{"name": "DS_1", "structure": "Struct_1"}], + "structures": [{"name": "Struct_1"}], + }, + "'components' is a required property", + ), +] + + +@pytest.mark.parametrize("data_structures, expected_fragment", schema_validation_cases) +def test_data_structure_schema_validation(data_structures, 
expected_fragment): + """Schema validation rejects malformed data_structures with informative messages.""" + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert expected_fragment in excinfo.value.args[0] + + +schema_failing_element_cases = [ + # Dataset error -> message includes "Dataset '{name}'" + ( + { + "datasets": [ + { + "name": "DS_named", + "DataStructure": [{"name": "Id_1", "type": "Integer"}], + } + ] + }, + "Dataset 'DS_named'", + ), + # Scalar error -> message includes "Scalar '{name}'" + ( + {"scalars": [{"name": "sc_named", "type": "BadType"}]}, + "Scalar 'sc_named'", + ), + # Structure error -> message includes "Structure '{name}'" + ( + { + "datasets": [{"name": "DS_1", "structure": "Struct_named"}], + "structures": [ + { + "name": "Struct_named", + "components": [{"name": "Id_1", "role": "BadRole", "data_type": "Integer"}], + } + ], + }, + "Structure 'Struct_named'", + ), +] + + +@pytest.mark.parametrize("data_structures, expected_fragment", schema_failing_element_cases) +def test_data_structure_schema_error_identifies_element(data_structures, expected_fragment): + """The validation error message points to the specific dataset/scalar/structure that failed.""" + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert expected_fragment in excinfo.value.args[0] + + +def test_referenced_structure_not_found(): + """A dataset that references a non-existent structure raises a clear error.""" + data_structures = { + "datasets": [{"name": "DS_1", "structure": "Missing"}], + "structures": [ + { + "name": "Other", + "components": [{"name": "Id_1", "data_type": "Integer", "role": "Identifier"}], + } + ], + } + with pytest.raises(InputValidationException, match="0-2-1-1") as excinfo: + load_datasets(data_structures) + assert "Missing" in excinfo.value.args[0]