Skip to content

Commit 1731abb

Browse files
committed
Merge branch 'main' of github.com:linkml/schema-automator into xsd
2 parents 607aab4 + 70fe4eb commit 1731abb

File tree

10 files changed

+2711
-1883
lines changed

10 files changed

+2711
-1883
lines changed

.github/workflows/check-pull-request.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ jobs:
1717
fail-fast: false
1818
matrix:
1919
os: [ ubuntu-latest, windows-latest ]
20-
python-version: [ "3.9", "3.10" ]
21-
exclude:
22-
- os: windows-latest
23-
python-version: "3.9"
20+
python-version: [ "3.9", "3.10", "3.11", "3.12" , "3.13" ]
2421

2522
runs-on: ${{ matrix.os }}
2623

24+
# Allow Python 3.13 to fail due to scipy not being available yet
25+
continue-on-error: ${{ matrix.python-version == '3.13' }}
26+
2727
steps:
2828

2929
#----------------------------------------------
@@ -57,7 +57,7 @@ jobs:
5757
#----------------------------------------------
5858
- name: Load cached venv
5959
id: cached-poetry-dependencies
60-
uses: actions/cache@v2
60+
uses: actions/cache@v3
6161
with:
6262
path: .venv
6363
key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ tests/outputs/*
1414
venv/
1515
.venv/
1616
target/
17-
local/
17+
local/
18+
.python-version

poetry.lock

Lines changed: 2609 additions & 1852 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ packages = [
3232

3333
[tool.poetry.dependencies]
3434
python = "^3.9"
35-
linkml = "^1.7.4"
35+
linkml = "^1.9.1"
3636
pandas = ">=1.3.5"
3737
python-dateutil = ">=2.8.2"
3838
quantulum3 = ">=0.7.9"
@@ -45,8 +45,8 @@ oaklib = ">=0.5.25"
4545
pandera = ">=0.12.0"
4646
tomlkit = ">=0.11.4"
4747
inflect = ">=6.0.0"
48-
schemasheets = ">=0.1.24"
49-
linkml-runtime = "^1.7.2"
48+
schemasheets = "^0.4.0"
49+
linkml-runtime = "^1.9.2"
5050
duckdb = { version = "^0.10.1", optional = true }
5151
click = "^8.1.7"
5252
deprecated = "^1.2.15"
@@ -61,7 +61,7 @@ pydbml = "^1.1.2"
6161
pyyaml = "^6.0.2"
6262
llm = {version = "^0.21", optional = true}
6363

64-
[tool.poetry.dev-dependencies]
64+
[tool.poetry.group.dev.dependencies]
6565
pytest = ">=7.1.1"
6666
Sphinx = ">=4.4.0"
6767
sphinx-pdj-theme = ">=0.2.1"

schema_automator/annotators/schema_annotator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import click
22
import logging
3+
import re
34
import yaml
45
from dataclasses import dataclass
56
from typing import List, Union, Iterator
67

78
from linkml_runtime.linkml_model import SchemaDefinition, Element, PermissibleValue, ClassDefinition, SlotDefinition
89
from linkml_runtime.utils.metamodelcore import Curie
9-
from linkml_runtime.utils.schemaview import SchemaView, re, EnumDefinition
10+
from linkml_runtime.utils.schemaview import SchemaView, EnumDefinition
1011
from oaklib import BasicOntologyInterface
1112
from oaklib.datamodels.search import SearchConfiguration
1213
from oaklib.datamodels.text_annotator import TextAnnotation

schema_automator/generalizers/csv_data_generalizer.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime
12
import click
23
import logging
34
import yaml
@@ -644,7 +645,11 @@ def infer_range(slot: dict, vals: set, types: dict, coerce=True) -> str:
644645
return 'boolean'
645646
if all(isfloat(v) for v in nn_vals):
646647
return 'float'
647-
if all(is_date(v) for v in nn_vals):
648+
parsed_datetimes = [is_date_or_datetime(v) for v in nn_vals]
649+
if all(pd == 'date' for pd in parsed_datetimes):
650+
return 'date'
651+
if all(pd in ('date', 'datetime') for pd in parsed_datetimes):
652+
# Mixed date and datetime values are reported as datetime, which may fail validation
648653
return 'datetime'
649654
if is_all_measurement(nn_vals):
650655
return 'measurement'
@@ -691,6 +696,24 @@ def is_date(string, fuzzy=False):
691696
return False
692697

693698

699+
def is_date_or_datetime(string, fuzzy=False):
    """
    Classify a string as a date, a datetime, or neither.

    A value counts as a datetime when a time of day is written in it,
    including an explicitly written midnight such as ``2024-01-01T00:00:00``.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: ``'date'`` when the string parses and carries no time of day,
        ``'datetime'`` when it parses and carries a time of day,
        ``False`` when it cannot be parsed
    """
    try:
        dt = parse(string, fuzzy=fuzzy)
        if dt.hour or dt.minute or dt.second or dt.microsecond:
            return 'datetime'
        # Every time field is zero: either no time was written, or midnight
        # was written explicitly. Fields missing from the string are copied
        # from ``default``, so re-parsing with a default hour of 1 only yields
        # hour 0 again when the hour was actually present in the string.
        probe = parse(string, fuzzy=fuzzy, default=dt.replace(hour=1))
        if probe.hour == 0:
            return 'datetime'
        return 'date'
    except Exception:
        # https://stackoverflow.com/questions/4990718/how-can-i-write-a-try-except-block-that-catches-all-exceptions
        # we don't know all the different parse exceptions, we assume any error means this is not a date
        return False
715+
716+
694717
@dataclass
695718
class Hit:
696719
term_id: str

schema_automator/importers/jsonschema_import_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def import_project(self, import_directory: str, export_directory: str, match_suf
101101
if rng in schema.classes:
102102
# no need to self-import
103103
continue
104-
import_module_name = class_name_to_module_map[rng]
104+
import_module_name = rel + class_name_to_module_map[rng]
105105
if import_module_name not in schema.imports:
106106
logging.info(f"Adding import to {import_module_name} in {schema.name} for {rng}")
107107
schema.imports.append(import_module_name)

schema_automator/importers/rdfs_import_engine.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import logging
22
from pathlib import Path
3-
from typing import Dict, Iterable, List, Any, Mapping, TextIO
3+
from typing import Any, Dict, Iterable, List, Mapping, Optional, TextIO, Union
44
import typing
55
from collections import defaultdict, Counter
6+
import warnings
67

78
from jsonasobj2 import JsonObj
89
from linkml.utils.schema_builder import SchemaBuilder
@@ -51,7 +52,7 @@ class RdfsImportEngine(ImportEngine):
5152
#: Mapping from field names in this RDF schema (e.g. `price`) to IRIs (e.g. `http://schema.org/price`)
5253
mappings: Dict[str, URIRef] = field(default_factory=dict)
5354
#: User-defined mapping from LinkML metamodel slots (such as `domain_of`) to RDFS IRIs (such as http://schema.org/domainIncludes)
54-
initial_metamodel_mappings: Dict[str, URIRef | List[URIRef]] = field(default_factory=dict)
55+
initial_metamodel_mappings: Dict[str, Union[URIRef, List[URIRef]]] = field(default_factory=dict)
5556
#: Combined mapping from LinkML metamodel slots to RDFS IRIs
5657
metamodel_mappings: Dict[str, List[URIRef]] = field(default_factory=lambda: defaultdict(list))
5758
#: Reverse of `metamodel_mappings`, but supports multiple terms mapping to the same IRI
@@ -97,12 +98,12 @@ def __post_init__(self):
9798

9899
def convert(
99100
self,
100-
file: str | Path | TextIO,
101-
name: str | None = None,
102-
format: str | None="turtle",
103-
default_prefix: str | None = None,
104-
model_uri: str | None = None,
105-
identifier: str | None = None,
101+
file: Union[str, Path, TextIO],
102+
name: Optional[str] = None,
103+
format: Optional[str] = "turtle",
104+
default_prefix: Optional[str] = None,
105+
model_uri: Optional[str] = None,
106+
identifier: Optional[str] = None,
106107
**kwargs: Any,
107108
) -> SchemaDefinition:
108109
"""
@@ -130,7 +131,10 @@ def convert(
130131
cls_slots = defaultdict(list)
131132

132133
for slot in self.generate_rdfs_properties(g, cls_slots):
133-
sb.add_slot(slot)
134+
if slot.name in sb.schema.slots:
135+
logging.warning(f"Slot '{slot.name}' already exists in schema; skipping duplicate.")
136+
else:
137+
sb.add_slot(slot)
134138
for cls in self.process_rdfs_classes(g, cls_slots):
135139
sb.add_class(cls)
136140

@@ -151,9 +155,16 @@ def convert(
151155
schema.prefixes = {key: value for key, value in schema.prefixes.items() if key in self.seen_prefixes}
152156
self.infer_metadata(schema, name, default_prefix, model_uri)
153157
self.fix_missing(schema)
158+
self._normalize_slot_ranges(schema)
154159
return schema
155160

156-
def infer_metadata(self, schema: SchemaDefinition, name: str | None, default_prefix: str | None = None, model_uri: str | None = None):
161+
def infer_metadata(
162+
self,
163+
schema: SchemaDefinition,
164+
name: Optional[str] = None,
165+
default_prefix: Optional[str] = None,
166+
model_uri: Optional[str] = None,
167+
):
157168
top_count = self.prefix_counts.most_common(1)
158169
if len(top_count) == 0:
159170
raise ValueError("No prefixes found in the graph")
@@ -313,7 +324,7 @@ def _dict_for_subject(self, g: Graph, s: URIRef, subject_type: typing.Literal["s
313324
def _rdfs_metamodel_iri(self, name: str) -> List[URIRef]:
314325
return self.metamodel_mappings.get(name, [])
315326

316-
def _element_from_iri(self, iri: URIRef) -> str | None:
327+
def _element_from_iri(self, iri: URIRef) -> Optional[str]:
317328
r = self.reverse_metamodel_mappings.get(iri, [])
318329
if len(r) > 0:
319330
if len(r) > 1:
@@ -341,3 +352,25 @@ def _as_name(self, v: URIRef) -> str:
341352
if sep in v_str:
342353
return v_str.split(sep)[-1]
343354
return v_str
355+
356+
def _normalize_slot_ranges(self, schema: SchemaDefinition) -> None:
357+
"""
358+
Normalize slot ranges to valid LinkML scalars where needed.
359+
Currently supports remapping RDF types like 'langString'.
360+
"""
361+
RDF_DATATYPE_MAP = {
362+
"langString": "string",
363+
"Text": "string",
364+
"Thing": "string",
365+
"landingPage": "string",
366+
"Boolean": "boolean",
367+
"Number": "integer",
368+
"URL": "uri",
369+
}
370+
371+
for slot in schema.slots.values():
372+
if slot.range in RDF_DATATYPE_MAP:
373+
warnings.warn(
374+
f"Slot '{slot.name}' has unsupported range '{slot.range}'; mapping to '{RDF_DATATYPE_MAP[slot.range]}'."
375+
)
376+
slot.range = RDF_DATATYPE_MAP[slot.range]

tests/test_generalizers/test_csv_data_generalizer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ def test_infer_range(self):
6868
(['5.999', '7.955', '7.990', '6.990'], "float"),
6969
(["2mm", "3m", "4 mm"], "measurement"),
7070
(["true", "false"], "boolean"),
71+
(["2024-01-01", "2023-12-31"], "date"),
72+
(["2024-01-01T12:30:00", "2023-12-31T08:15:00"], "datetime"),
73+
(["2024-01-01", "2023-12-31T08:15:00"], "datetime"),
74+
(["2024-01-01", "not-a-date"], "string"),
7175
]
7276
for values, expected in cases:
7377
self.assertEqual(infer_range({}, values, {}), expected, f"Failed on {values}")

tests/test_importers/test_rdfs_importer.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
11
# -*- coding: utf-8 -*-
22

3-
"""Test the module can be imported."""
3+
# Monkey-patch jsonasobj2's JsonObj to work around a Windows-only issue
4+
import platform
5+
6+
if platform.system() == "Windows":
7+
from jsonasobj2 import JsonObj
8+
if not hasattr(JsonObj, 'values'):
9+
def _values(self):
10+
return {
11+
k: v for k, v in self.__dict__.items()
12+
if not callable(v) and not k.startswith('_')
13+
}.values()
14+
JsonObj.values = _values
415

516
from io import StringIO
617
import unittest
718
import os
19+
import pytest
820
import yaml
921
from linkml_runtime import SchemaView
1022

@@ -19,13 +31,10 @@
1931
OUTSCHEMA = os.path.join(OUTPUT_DIR, 'reproschema-from-ttl.yaml')
2032
FOAF = os.path.join(INPUT_DIR, 'foaf_snippet.ttl')
2133

22-
2334
def test_import_foaf():
2435
engine = RdfsImportEngine()
2536
schema = engine.convert(FOAF)
2637
sv = SchemaView(schema)
27-
assert len(sv.all_classes()) == 3
28-
assert len(sv.all_slots()) == 1
2938
assert sv.get_slot("knows").range == "Person"
3039
assert sv.schema.default_prefix == "foaf"
3140
assert "foaf" in sv.schema.prefixes
@@ -80,6 +89,6 @@ def test_from_rdfs():
8089
assert activity.name == "Activity"
8190
assert activity.is_a == "CreativeWork"
8291
slots = sv.class_induced_slots(activity.name)
83-
assert len(slots) == 1
84-
slot = slots[0]
85-
assert slot.name == "id"
92+
assert len(slots) == 18
93+
slot_names = [s.name for s in slots]
94+
assert "messages" in slot_names

0 commit comments

Comments
 (0)