Skip to content

Commit ff055cf

Browse files
ghuronclaude
andcommitted
Redesign SourceItem.extract() return contract
- extract() now always returns SourceItem (never None) - Add new_ident field: set on redirect, replaces patch[0] convention - parse() is now void and receives ident as parameter - Connectors set self.prior_ident directly when primary-property lookup confirms absence (ADS 0-doc bibcode, EuropePMC PMID, ArxivItem P818, ORCID missing path) - item.py transform() uses new_ident instead of patch[0] for replacement Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a94c3ff commit ff055cf

File tree

14 files changed

+223
-83
lines changed

14 files changed

+223
-83
lines changed

src/wdpy/connectors/ads.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ def make_request(cls, ident: Statement) -> Optional[Request]:
1515
headers={'Authorization': 'Bearer ogOoi0uDxIebyeseB3tAbf5mBTJxXQQWQqE5TW40'}
1616
)
1717

18-
def parse(self, text: str) -> bool:
18+
def parse(self, text: str, ident: Statement) -> None:
1919
docs = json.loads(text).get('response', {}).get('docs', [])
2020
if len(docs) != 1:
21-
if self.patch and self.patch[0].mainsnak.property == 'P819':
22-
logging.warning('Got %d results for %s', len(docs), self.patch[0].mainsnak.value)
23-
self.patch = []
24-
return True
25-
return False
21+
if ident.mainsnak.property == 'P819':
22+
logging.warning('Got %d results for %s', len(docs), ident.mainsnak.value)
23+
if len(docs) == 0:
24+
self.prior_ident = ident
25+
return
2626
d = docs[0]
2727
if 'page' in d:
2828
p = d['page'][0] if isinstance(d['page'], list) else d['page']
@@ -36,13 +36,12 @@ def parse(self, text: str) -> bool:
3636
self.obtain(d, self._config.get('fields', {}), self._config.get('translate', {}))
3737
for i, name in enumerate(d.get('author', []), 1):
3838
self.add_author(name, _get_orcid(d, i - 1))
39-
for ident in d.get('identifier', []):
40-
if ident.startswith('arXiv:'):
41-
self.add_claim('P818', ident[6:])
42-
elif ident.startswith('10.') and 'ARXIV' not in ident.upper():
43-
self.add_claim('P356', ident)
39+
for identifier in d.get('identifier', []):
40+
if identifier.startswith('arXiv:'):
41+
self.add_claim('P818', identifier[6:])
42+
elif identifier.startswith('10.') and 'ARXIV' not in identifier.upper():
43+
self.add_claim('P356', identifier)
4444
self.add_claim('P31', 'Q13442814')
45-
return True
4645

4746

4847
def _get_orcid(d: dict, idx: int) -> Optional[str]:

src/wdpy/connectors/arxiv.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
from xml.etree import ElementTree
2-
from wdpy import SourceItem
2+
from wdpy import SourceItem, Statement
33

44
class ArxivItem(SourceItem):
5-
def parse(self, text: str) -> bool:
5+
def parse(self, text: str, ident: Statement) -> None:
66
ns = self._config['namespaces']
77
if (entry := ElementTree.fromstring(text).find('w3:entry', ns)) is None:
8-
self.patch = []
9-
return True
8+
if ident.mainsnak.property == 'P818':
9+
self.prior_ident = ident
10+
return
1011
id_text = getattr(entry.find('w3:id', ns), 'text', '') or ''
11-
if self.patch and self.patch[0].mainsnak.property == 'P818' and '/abs/' not in id_text:
12-
return False
12+
if ident.mainsnak.property == 'P818' and '/abs/' not in id_text:
13+
self.prior_ident = ident
14+
return
1315
arxiv_id = id_text.split('/')[-1].split('v')[0]
1416
self.add_claim('P31', 'Q13442814')
1517
if arxiv_id: self.add_claim('P953', 'https://arxiv.org/pdf/' + arxiv_id)
@@ -22,8 +24,3 @@ def parse(self, text: str) -> bool:
2224
self.add_claim('P2093', name).set_qualifier('P1545', str(i := i + 1))
2325
if len(doi := entry.findall('arxiv:doi', ns)) == 1 and doi[0].text:
2426
self.add_claim('P356', doi[0].text.upper())
25-
return True
26-
27-
# r = ArxivItem.extract(Statement(Snak('P818', ('1309.0951',))))
28-
# r = ArxivItem.extract(Statement(Snak('P356', ('10.4171/161',))))
29-
# pass

src/wdpy/connectors/crossref.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import json
2-
from wdpy import SourceItem
2+
from wdpy import SourceItem, Statement
33

44

55
class Crossref(SourceItem):
6-
def parse(self, text: str) -> bool:
6+
def parse(self, text: str, ident: Statement) -> None:
77
self.obtain(json.loads(text), self._config.get('fields', {}), self._config.get('translate', {}))
8-
return True

src/wdpy/connectors/europepmc.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
import json
2-
from wdpy import SourceItem
2+
from wdpy import SourceItem, Statement
33

44

55
class EuropePMC(SourceItem):
6-
def parse(self, text: str) -> bool:
6+
def parse(self, text: str, ident: Statement) -> None:
77
results = json.loads(text).get('resultList', {}).get('result', [])
88
if len(results) != 1:
9-
if self.patch and self.patch[0].mainsnak.property == 'P698':
10-
return False
11-
self.patch = []
12-
return True
9+
if ident.mainsnak.property == 'P698':
10+
self.prior_ident = ident
11+
return
1312
d = results[0]
1413
fields = self._config.get('fields', {})
1514
translate = self._config.get('translate', {})
@@ -19,4 +18,3 @@ def parse(self, text: str) -> bool:
1918
for a in d.get('authorList', {}).get('author', []):
2019
orcid = a['authorId']['value'] if a.get('authorId', {}).get('type') == 'ORCID' else None
2120
self.add_author(a.get('firstName', '') + ' ' + a.get('lastName', ''), orcid)
22-
return True

src/wdpy/connectors/orcid.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,15 @@ def make_request(cls, ident: Statement) -> Optional[Request]:
1111
req.add_header('Accept', 'application/json')
1212
return req
1313

14-
def parse(self, text: str) -> bool:
14+
def parse(self, text: str, ident: Statement) -> None:
1515
data = json.loads(text)
1616
if 'path' not in data:
17-
self.patch = []
18-
return True
17+
self.prior_ident = ident
18+
return
1919
self.add_claim('P496', data['path'].split('/')[1])
2020
self.add_claim('P31', 'Q5')
2121
self.add_claim('P106', 'Q1650915')
2222
for i in data.get('external-identifiers', {}).get('external-identifier', []):
2323
prop = self._config.get('fields', {}).get(i.get('external-id-type'))
2424
if prop:
2525
self.add_claim(prop, i.get('external-id-value'))
26-
return True

src/wdpy/item.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,9 @@ def _extract_new(self, transformed: List[SourceItem]) -> Optional[SourceItem]:
6161
if stmt.rank == 'deprecated' or not stmt.mainsnak.value:
6262
continue
6363
for model_type in SourceItem.get_extractors():
64-
if (model := model_type.extract(stmt)) is None:
64+
model = model_type.extract(stmt)
65+
if not (model.patch or model.prior_ident):
6566
continue
66-
if model.patch:
67-
s = model.patch[0]
68-
if s.mainsnak.value and (s.mainsnak.property, s.mainsnak.value[0]) in deprecated:
69-
continue
7067
if model not in transformed:
7168
return model
7269
return None
@@ -112,11 +109,24 @@ def _ensure_loaded(self) -> None:
112109
self._loaded = True
113110

114111
def transform(self, model: SourceItem) -> None:
115-
if not isinstance(model, SourceItem) or model.patch is None:
112+
if not isinstance(model, SourceItem):
113+
return
114+
if model.patch is None and not model.prior_ident and not model.new_ident:
116115
return
117116
self._ensure_loaded()
118117
source = model.get_db_ref()
119118

119+
if model.prior_ident:
120+
prop = model.prior_ident.mainsnak.property
121+
old_val = model.prior_ident.mainsnak.value
122+
for stmt in self.claims.get(prop) or []:
123+
if stmt.mainsnak.value == old_val:
124+
if model.new_ident:
125+
stmt.mainsnak.value = model.new_ident.mainsnak.value
126+
else:
127+
stmt.set_rank('deprecated', 'Q21441764')
128+
break
129+
120130
if not model.patch:
121131
for prop in model.get_properties():
122132
for stmt in self.claims.get(prop) or []:

src/wdpy/source_item.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def request(url: str,
3333
class SourceItem:
3434
patch: Optional[List[Statement]] = None
3535
proposed_label: Optional[str] = None
36+
prior_ident: Optional[Statement] = None
37+
new_ident: Optional[Statement] = None
3638
_lookup_cache: ClassVar[Dict[str, Dict[Any, Optional[str]]]] = {}
3739
_registry: ClassVar[List[type]] = []
3840
_connectors_loaded: ClassVar[bool] = False
@@ -145,37 +147,50 @@ def register_new_item(statements: List[Statement]) -> None:
145147
property_id, value, qid, SourceItem._lookup_cache[property_id][value])
146148
SourceItem._lookup_cache[property_id][value] = qid
147149

148-
def parse(self, text: str) -> bool:
150+
def parse(self, text: str, ident: Statement) -> None:
149151
raise NotImplementedError('Subclasses must implement parse')
150152

151153
@classmethod
152-
def extract(cls, ident: Statement) -> Optional[SourceItem]:
153-
"""Returns None on retrieval failure; instance with empty patch if datasource
154-
confirms the record does not exist; instance with non-empty patch where
155-
patch[0] is the requested id (possibly updated from a redirect)."""
154+
def extract(cls, ident: Statement) -> SourceItem:
155+
"""Fetch and parse the record for `ident`. Always returns a SourceItem.
156+
157+
Empty (patch/prior_ident/new_ident all None): no info — network error,
158+
unrecognised property, or parse failure.
159+
160+
prior_ident set, new_ident None: ident confirmed gone; caller should
161+
deprecate that statement.
162+
163+
prior_ident and new_ident set: ident redirected; caller should replace
164+
old value with new_ident. patch carries any additional claims.
165+
166+
Only patch set: ident unchanged; patch carries new claims to merge.
167+
"""
156168
if not (req := cls.make_request(ident)):
157-
return None
169+
return cls()
158170
url = req.full_url
159171
try:
160172
resp = build_opener().open(req, timeout=30)
161173
except urllib.error.HTTPError as e:
162174
resp = e
163175
except Exception as e:
164176
logging.error('Request failed for %s: %s', url, e)
165-
return None
177+
return cls()
166178
handled = cls._config.get('extract', [])
167179
with resp:
168180
if ((code := resp.getcode()) == 404) and (404 in handled):
169-
return cls(patch=[])
181+
first_prop = next(iter(cls._config.get('properties', {})), None)
182+
if ident.mainsnak.property == first_prop:
183+
return cls(prior_ident=ident)
184+
return cls()
170185
destination = resp.geturl()
171186
text = resp.read().decode('utf-8')
172187
if code != 200:
173188
logging.error(f'Returned code {code} for {url}, message: {text}')
174189
destination_url = destination or url
175190
redirected = 301 in handled and destination_url.lower() != url.lower()
191+
item = cls()
176192
if redirected:
177-
patch_ident = cls.update_ident(ident, destination_url)
178-
else:
179-
patch_ident = ident
180-
item = cls(patch=[patch_ident])
181-
return item if item.parse(text) else None
193+
item.prior_ident = ident
194+
item.new_ident = cls.update_ident(ident, destination_url)
195+
item.parse(text, ident)
196+
return item

tests/integration/test_ads.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,30 @@
66
class TestExtract(TestCase):
77
def test_doi_returns_result(self):
88
result = ADS.extract(Statement(Snak('P356', ('10.1088/2041-8205/763/1/L1',))))
9-
self.assertEqual(result.patch[0].mainsnak.property, 'P356')
9+
self.assertTrue(result.patch)
1010

1111
def test_bibcode_returns_result(self):
1212
result = ADS.extract(Statement(Snak('P819', ('2013ApJ...763L...1M',))))
13-
self.assertEqual(result.patch[0].mainsnak.property, 'P819')
13+
self.assertTrue(result.patch)
1414

1515
def test_arxiv_returns_result(self):
1616
result = ADS.extract(Statement(Snak('P818', ('1212.1162',))))
17-
self.assertEqual(result.patch[0].mainsnak.property, 'P818')
17+
self.assertTrue(result.patch)
1818

19-
def test_nonexistent_bibcode_returns_empty_patch(self):
19+
def test_nonexistent_bibcode_sets_prior_ident(self):
2020
result = ADS.extract(Statement(Snak('P819', ('X',))))
21-
self.assertEqual(result.patch, [])
21+
self.assertIsNotNone(result.prior_ident)
22+
self.assertIsNone(result.patch)
23+
self.assertIsNone(result.new_ident)
2224

23-
def test_nonexistent_doi_returns_none(self):
24-
self.assertIsNone(ADS.extract(Statement(Snak('P356', ('X',)))))
25+
def test_nonexistent_doi_returns_empty(self):
26+
result = ADS.extract(Statement(Snak('P356', ('X',))))
27+
self.assertIsNone(result.prior_ident)
28+
self.assertIsNone(result.patch)
29+
self.assertIsNone(result.new_ident)
2530

26-
def test_nonexistent_arxiv_returns_none(self):
27-
self.assertIsNone(ADS.extract(Statement(Snak('P818', ('X',)))))
31+
def test_nonexistent_arxiv_returns_empty(self):
32+
result = ADS.extract(Statement(Snak('P818', ('X',))))
33+
self.assertIsNone(result.prior_ident)
34+
self.assertIsNone(result.patch)
35+
self.assertIsNone(result.new_ident)

tests/integration/test_arxiv.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,20 @@
77
class TestExtract(TestCase):
88
def test_arxiv_id_returns_result(self):
99
result = ArxivItem.extract(Statement(Snak('P818', ('1309.0951',))))
10-
self.assertEqual(result.patch[0].mainsnak.property, 'P818')
10+
self.assertTrue(result.patch)
1111

1212
def test_doi_returns_result(self):
1313
result = ArxivItem.extract(Statement(Snak('P356', ('10.4171/161',))))
14-
self.assertEqual(result.patch[0].mainsnak.property, 'P356')
14+
self.assertTrue(result.patch)
1515

16-
def test_nonexistent_arxiv_id_returns_none(self):
17-
self.assertIsNone(ArxivItem.extract(Statement(Snak('P818', ('X',)))))
16+
def test_nonexistent_arxiv_id_sets_prior_ident(self):
17+
result = ArxivItem.extract(Statement(Snak('P818', ('X',))))
18+
self.assertIsNotNone(result.prior_ident)
19+
self.assertIsNone(result.patch)
20+
self.assertIsNone(result.new_ident)
1821

19-
def test_nonexistent_doi_returns_empty_patch(self):
22+
def test_nonexistent_doi_returns_empty(self):
2023
result = ArxivItem.extract(Statement(Snak('P356', ('10.99999/nonexistent',))))
21-
self.assertEqual(result.patch, [])
24+
self.assertIsNone(result.prior_ident)
25+
self.assertIsNone(result.patch)
26+
self.assertIsNone(result.new_ident)

tests/integration/test_crossref.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
class TestExtract(TestCase):
77
def test_doi_returns_result(self):
88
result = Crossref.extract(Statement(Snak('P356', ('10.1088/2041-8205/763/1/L1',))))
9-
self.assertEqual(result.patch[0].mainsnak.property, 'P356')
9+
self.assertTrue(result.patch)
1010

11-
def test_nonexistent_doi_returns_empty_patch(self):
11+
def test_nonexistent_doi_sets_prior_ident(self):
1212
result = Crossref.extract(Statement(Snak('P356', ('X',))))
13-
self.assertEqual(result.patch, [])
13+
self.assertIsNotNone(result.prior_ident)
14+
self.assertIsNone(result.patch)
15+
self.assertIsNone(result.new_ident)

0 commit comments

Comments
 (0)