Skip to content

Commit 730689a

Browse files
ghuronclaude
andcommitted
Simplify deprecate_ident() and ads parse() identifier loop
- Remove dead `if reason:` guard in deprecate_ident() — default is always truthy, no caller passes falsy - Pass ident.qualifiers directly instead of copying via list() - Assign d.get('identifier', []) once in ADS.parse() and inline the DOI filter into the any() predicate Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d7f9669 commit 730689a

16 files changed

Lines changed: 164 additions & 132 deletions

src/wdpy/connectors/ads.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def parse(self, text: str, ident: Statement) -> None:
2121
if ident.mainsnak.property == 'P819':
2222
logging.warning('Got %d results for %s', len(docs), ident.mainsnak.value)
2323
if len(docs) == 0:
24-
self.prior_ident = ident
24+
self.deprecate_ident(ident)
2525
return
2626
d = docs[0]
2727
if 'page' in d:
@@ -39,18 +39,22 @@ def parse(self, text: str, ident: Statement) -> None:
3939
if ident.mainsnak.property == 'P819':
4040
bibcode = d.get('bibcode')
4141
if bibcode and bibcode != ident.mainsnak.value[0]:
42-
self.prior_ident = ident
43-
self.new_ident = Statement(Snak('P819', (bibcode,)))
42+
self.deprecate_ident(ident)
43+
self.add_claim('P819', bibcode)
4444
fields = {k: v for k, v in fields.items() if k != 'bibcode'}
4545

4646
self.obtain(d, fields, translate)
4747
for i, name in enumerate(d.get('author', []), 1):
4848
self.add_author(name, _get_orcid(d, i - 1))
49-
for identifier in d.get('identifier', []):
49+
identifiers = d.get('identifier', [])
50+
has_journal_doi = any(i.startswith('10.') and 'ARXIV' not in i.upper() for i in identifiers)
51+
for identifier in identifiers:
5052
if identifier.startswith('arXiv:'):
5153
self.add_claim('P818', identifier[6:])
52-
elif identifier.startswith('10.') and 'ARXIV' not in identifier.upper():
53-
self.add_claim('P356', identifier)
54+
elif identifier.startswith('10.'):
55+
s = self.add_claim('P356', identifier)
56+
if has_journal_doi and 'ARXIV' in identifier.upper():
57+
s.rank = 'deprecated'
5458
self.add_claim('P31', 'Q13442814')
5559

5660

src/wdpy/connectors/arxiv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ def parse(self, text: str, ident: Statement) -> None:
66
ns = self._config['namespaces']
77
if (entry := ElementTree.fromstring(text).find('w3:entry', ns)) is None:
88
if ident.mainsnak.property == 'P818':
9-
self.prior_ident = ident
9+
self.deprecate_ident(ident)
1010
return
1111
id_text = getattr(entry.find('w3:id', ns), 'text', '') or ''
1212
if ident.mainsnak.property == 'P818' and '/abs/' not in id_text:
13-
self.prior_ident = ident
13+
self.deprecate_ident(ident)
1414
return
1515
arxiv_id = id_text.split('/')[-1].split('v')[0]
1616
self.add_claim('P31', 'Q13442814')

src/wdpy/connectors/europepmc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def parse(self, text: str, ident: Statement) -> None:
77
results = json.loads(text).get('resultList', {}).get('result', [])
88
if len(results) != 1:
99
if ident.mainsnak.property == 'P698':
10-
self.prior_ident = ident
10+
self.deprecate_ident(ident)
1111
return
1212
d = results[0]
1313
fields = self._config.get('fields', {})

src/wdpy/connectors/exoplanet.py

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
from __future__ import annotations
22
from typing import Optional
33
from wdpy.connectors import AstroItem
4-
from wdpy import Snak, Statement
54
import urllib.parse
65

76

8-
class ExoplanetItem(AstroItem):
7+
class ExoplanetItem(AstroItem):
98
@staticmethod
109
def _extract_catalog_id(url: Optional[str]) -> Optional[str]:
1110
if not isinstance(url, str):
@@ -17,19 +16,3 @@ def _extract_catalog_id(url: Optional[str]) -> Optional[str]:
1716
return None
1817
return parts[-1]
1918

20-
@classmethod
21-
def update_ident(cls, ident: Statement, url: str) -> Statement:
22-
target = cls._extract_catalog_id(url)
23-
if not target:
24-
raise ValueError(f'Cannot extract exoplanet id from {url}')
25-
snak = ident.mainsnak if isinstance(ident, Statement) else None
26-
if not isinstance(snak, Snak):
27-
raise TypeError('Invalid statement mainsnak for redirect update')
28-
return Statement(
29-
Snak(snak.property, (target,), snak.snaktype),
30-
id=ident.id,
31-
rank=ident.rank,
32-
qualifiers=ident.qualifiers,
33-
references=ident.references,
34-
)
35-

src/wdpy/connectors/orcid.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def make_request(cls, ident: Statement) -> Optional[Request]:
1414
def parse(self, text: str, ident: Statement) -> None:
1515
data = json.loads(text)
1616
if 'path' not in data:
17-
self.prior_ident = ident
17+
self.deprecate_ident(ident)
1818
return
1919
self.add_claim('P496', data['path'].split('/')[1])
2020
self.add_claim('P31', 'Q5')

src/wdpy/item.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def _extract_new(self, transformed: List[SourceItem]) -> Optional[SourceItem]:
6262
continue
6363
for model_type in SourceItem.get_extractors():
6464
model = model_type.extract(stmt)
65-
if not (model.patch or model.prior_ident):
65+
if not model.patch:
6666
continue
6767
if model not in transformed:
6868
return model
@@ -111,22 +111,11 @@ def _ensure_loaded(self) -> None:
111111
def transform(self, model: SourceItem) -> None:
112112
if not isinstance(model, SourceItem):
113113
return
114-
if model.patch is None and not model.prior_ident and not model.new_ident:
114+
if model.patch is None:
115115
return
116116
self._ensure_loaded()
117117
source = model.get_db_ref()
118118

119-
if model.prior_ident:
120-
prop = model.prior_ident.mainsnak.property
121-
old_val = model.prior_ident.mainsnak.value
122-
for stmt in self.claims.get(prop) or []:
123-
if stmt.mainsnak.value == old_val:
124-
if model.new_ident:
125-
stmt.mainsnak.value = model.new_ident.mainsnak.value
126-
else:
127-
stmt.set_rank('deprecated', 'Q21441764')
128-
break
129-
130119
if not model.patch:
131120
for prop in model.get_properties():
132121
for stmt in self.claims.get(prop) or []:

src/wdpy/source_item.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ def request(url: str,
3333
class SourceItem:
3434
patch: Optional[List[Statement]] = None
3535
proposed_label: Optional[str] = None
36-
prior_ident: Optional[Statement] = None
37-
new_ident: Optional[Statement] = None
3836
_lookup_cache: ClassVar[Dict[str, Dict[Any, Optional[str]]]] = {}
3937
_registry: ClassVar[List[type]] = []
4038
_connectors_loaded: ClassVar[bool] = False
@@ -104,6 +102,14 @@ def add_claim(self, property_id: str, *value: str) -> Statement:
104102
self.proposed_label = value[0]
105103
return s
106104

105+
def deprecate_ident(self, ident: Statement, reason: str = 'Q21441764') -> None:
106+
"""Append ident as a deprecated statement to patch."""
107+
s = Statement(ident.mainsnak, ident.id, 'deprecated', ident.qualifiers)
108+
s.set_qualifier('P2241', reason)
109+
if self.patch is None:
110+
self.patch = []
111+
self.patch.append(s)
112+
107113
def obtain(self, source: dict, properties: dict, translate: dict = {}) -> None:
108114
for key, prop in properties.items():
109115
if (val := source.get(key)) is None or not prop:
@@ -154,16 +160,14 @@ def parse(self, text: str, ident: Statement) -> None:
154160
def extract(cls, ident: Statement) -> SourceItem:
155161
"""Fetch and parse the record for `ident`. Always returns a SourceItem.
156162
157-
Empty (patch/prior_ident/new_ident all None): no info — network error,
158-
unrecognised property, or parse failure.
159-
160-
prior_ident set, new_ident None: ident confirmed gone; caller should
161-
deprecate that statement.
163+
Empty (patch None): no info — network error, unrecognised property,
164+
or parse failure.
162165
163-
prior_ident and new_ident set: ident redirected; caller should replace
164-
old value with new_ident. patch carries any additional claims.
166+
patch set with a deprecated statement: ident confirmed gone or
167+
redirected; patch may also carry additional claims.
165168
166-
Only patch set: ident unchanged; patch carries new claims to merge.
169+
Only normal statements in patch: ident unchanged; patch carries new
170+
claims to merge.
167171
"""
168172
if not (req := cls.make_request(ident)):
169173
return cls()
@@ -180,7 +184,9 @@ def extract(cls, ident: Statement) -> SourceItem:
180184
if ((code := resp.getcode()) == 404) and (404 in handled):
181185
first_prop = next(iter(cls._config.get('properties', {})), None)
182186
if ident.mainsnak.property == first_prop:
183-
return cls(prior_ident=ident)
187+
item = cls()
188+
item.deprecate_ident(ident)
189+
return item
184190
return cls()
185191
destination = resp.geturl()
186192
text = resp.read().decode('utf-8')
@@ -190,7 +196,8 @@ def extract(cls, ident: Statement) -> SourceItem:
190196
redirected = 301 in handled and destination_url.lower() != url.lower()
191197
item = cls()
192198
if redirected:
193-
item.prior_ident = ident
194-
item.new_ident = cls.update_ident(ident, destination_url)
199+
new_ident = cls.update_ident(ident, destination_url)
200+
item.patch = [new_ident]
201+
item.deprecate_ident(ident)
195202
item.parse(text, ident)
196203
return item

src/wdpy/statement.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,8 @@ def upsert(self, existing: List[Statement]) -> Optional[Statement]:
110110
continue
111111
if self._qualifiers_present_in(candidate):
112112
self._merge_references_into(candidate)
113+
if self.rank is not None:
114+
candidate.rank = self.rank
113115
return candidate
114116
return None
115117

tests/integration/test_ads.py

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,28 +16,55 @@ def test_arxiv_returns_result(self):
1616
result = ADS.extract(Statement(Snak('P818', ('1212.1162',))))
1717
self.assertTrue(result.patch)
1818

19-
def test_nonexistent_bibcode_sets_prior_ident(self):
20-
result = ADS.extract(Statement(Snak('P819', ('X',))))
21-
self.assertIsNotNone(result.prior_ident)
22-
self.assertIsNone(result.patch)
23-
self.assertIsNone(result.new_ident)
19+
def test_nonexistent_bibcode_sets_deprecated_patch(self):
20+
ident = Statement(Snak('P819', ('X',)))
21+
result = ADS.extract(ident)
22+
self.assertIsNotNone(result.patch)
23+
deprecated = [s for s in result.patch if s.rank == 'deprecated']
24+
self.assertTrue(deprecated)
25+
self.assertIs(deprecated[0].mainsnak, ident.mainsnak)
26+
self.assertFalse(hasattr(result, 'prior_ident'))
2427

2528
def test_nonexistent_doi_returns_empty(self):
2629
result = ADS.extract(Statement(Snak('P356', ('X',))))
27-
self.assertIsNone(result.prior_ident)
2830
self.assertIsNone(result.patch)
29-
self.assertIsNone(result.new_ident)
3031

3132
def test_nonexistent_arxiv_returns_empty(self):
3233
result = ADS.extract(Statement(Snak('P818', ('X',))))
33-
self.assertIsNone(result.prior_ident)
3434
self.assertIsNone(result.patch)
35-
self.assertIsNone(result.new_ident)
3635

3736
def test_bibcode_redirect(self):
38-
ident = Statement(Snak('P819', ('2023arXiv230313424H',)))
39-
result = ADS.extract(ident)
40-
self.assertIs(result.prior_ident, ident)
41-
self.assertEqual(result.new_ident.mainsnak.property, 'P819')
42-
self.assertEqual(result.new_ident.mainsnak.value, ('2023A&A...673A.114H',))
43-
self.assertTrue(result.patch)
37+
expected = {
38+
'2023A&A...673A.114H': ('P819', False),
39+
'2023arXiv230313424H': ('P819', True),
40+
'2303.13424': ('P818', False),
41+
'10.1051/0004-6361/202346285': ('P356', False),
42+
'10.48550/arXiv.2303.13424': ('P356', True),
43+
}
44+
result = ADS.extract(Statement(Snak('P819', ('2023arXiv230313424H',))))
45+
for s in (result.patch or []):
46+
if Snak.type_of(s.mainsnak.property) != 'external-id':
47+
continue
48+
value = (s.mainsnak.value or (None,))[0]
49+
self.assertIn(value, expected)
50+
prop, deprecated = expected.pop(value)
51+
self.assertEqual(s.mainsnak.property, prop, value)
52+
self.assertEqual(s.rank, 'deprecated' if deprecated else None, value)
53+
self.assertFalse(expected, f'Missing from patch: {list(expected)}')
54+
55+
def test_arxiv_only_bibcode(self):
56+
expected = {
57+
'2008arXiv0812.5116B': ('P819', False),
58+
'0812.5116': ('P818', False),
59+
'10.48550/arXiv.0812.5116': ('P356', False),
60+
}
61+
result = ADS.extract(Statement(Snak('P819', ('2008arXiv0812.5116B',))))
62+
for s in (result.patch or []):
63+
if Snak.type_of(s.mainsnak.property) != 'external-id':
64+
continue
65+
value = (s.mainsnak.value or (None,))[0]
66+
self.assertIn(value, expected)
67+
prop, deprecated = expected.pop(value)
68+
self.assertEqual(s.mainsnak.property, prop, value)
69+
self.assertEqual(s.rank, 'deprecated' if deprecated else None, value)
70+
self.assertFalse(expected, f'Missing from patch: {list(expected)}')

tests/integration/test_arxiv.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,16 @@ def test_doi_returns_result(self):
1313
result = ArxivItem.extract(Statement(Snak('P356', ('10.4171/161',))))
1414
self.assertTrue(result.patch)
1515

16-
def test_nonexistent_arxiv_id_sets_prior_ident(self):
17-
result = ArxivItem.extract(Statement(Snak('P818', ('X',))))
18-
self.assertIsNotNone(result.prior_ident)
19-
self.assertIsNone(result.patch)
20-
self.assertIsNone(result.new_ident)
16+
def test_nonexistent_arxiv_id_sets_deprecated_patch(self):
17+
ident = Statement(Snak('P818', ('X',)))
18+
result = ArxivItem.extract(ident)
19+
self.assertIsNotNone(result.patch)
20+
deprecated = [s for s in result.patch if s.rank == 'deprecated']
21+
self.assertTrue(deprecated)
22+
self.assertIs(deprecated[0].mainsnak, ident.mainsnak)
23+
self.assertFalse(hasattr(result, 'prior_ident'))
2124

2225
def test_nonexistent_doi_returns_empty(self):
2326
result = ArxivItem.extract(Statement(Snak('P356', ('10.99999/nonexistent',))))
24-
self.assertIsNone(result.prior_ident)
2527
self.assertIsNone(result.patch)
26-
self.assertIsNone(result.new_ident)
28+
self.assertFalse(hasattr(result, 'prior_ident'))

0 commit comments

Comments
 (0)