Skip to content

Commit 5084aff

Browse files
committed
Port some logic from previous version
1 parent 9c55425 commit 5084aff

2 files changed

Lines changed: 237 additions & 9 deletions

File tree

src/wdpy/source_item.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import sys
66
from dataclasses import dataclass
77
from pathlib import Path
8-
from typing import Any, Dict, List, Optional
8+
from typing import Any, ClassVar, Dict, List, Optional
99
import urllib.error
1010
from urllib.request import Request, build_opener
1111

@@ -30,6 +30,8 @@ def request(url: str,
3030
@dataclass
3131
class SourceItem:
3232
patch: Optional[List[Statement]] = None
33+
proposed_label: Optional[str] = None
34+
_lookup_cache: ClassVar[Dict[str, Dict[Any, Optional[str]]]] = {}
3335

3436
def __init_subclass__(cls, **kwargs:Any):
3537
super().__init_subclass__(**kwargs)
@@ -53,11 +55,15 @@ def get_db_ref(cls) -> Optional[str]:
5355
return cls._config.get('source_item')
5456

5557
@staticmethod
56-
def lookup (property_id:str, external_id:Any) -> Optional[str]:
57-
if result := haswbstatement(property_id, external_id):
58-
if len(result) > 1: # Todo: warning ONLY if it was pre-loaded
59-
logging.warning(f'{len(result)} instances {property_id}="{external_id}", using {result[0]}')
60-
return result[0]
58+
def lookup(property_id: str, external_id: Any) -> Optional[str]:
59+
by_prop = SourceItem._lookup_cache.setdefault(property_id, {})
60+
if external_id not in by_prop:
61+
by_prop[external_id] = None
62+
if result := haswbstatement(property_id, external_id):
63+
if len(result) > 1: # Todo: warning ONLY if it was pre-loaded
64+
logging.warning(f'{len(result)} instances {property_id}="{external_id}", using {result[0]}')
65+
by_prop[external_id] = result[0]
66+
return by_prop[external_id]
6167

6268
@classmethod
6369
def make_request(cls, ident: Statement) -> Optional[Request]:
@@ -73,6 +79,8 @@ def add_claim(self, property_id: str, *value: str) -> Statement:
7379
if self.patch is None:
7480
self.patch = []
7581
self.patch.append(s)
82+
if property_id == 'P1476' and not self.proposed_label and value:
83+
self.proposed_label = value[0]
7684
return s
7785

7886
def obtain(self, source: dict, properties: dict, translate: dict = {}) -> None:
@@ -98,6 +106,26 @@ def add_author(self, full_name: str, orcid: Optional[str] = None) -> Statement:
98106
s.set_qualifier('P1545', str(self._author_num))
99107
return s
100108

109+
@staticmethod
110+
def register_new_item(statements: List[Statement]) -> None:
111+
for s in statements:
112+
if Snak.type_of(s.mainsnak.property) != 'external-id':
113+
continue
114+
if not s.id or not s.mainsnak.value:
115+
continue
116+
property_id = s.mainsnak.property
117+
value = s.mainsnak.value[0]
118+
qid = s.id.split('$')[0]
119+
if property_id not in SourceItem._lookup_cache:
120+
logging.error('No lookup performed for %s', property_id)
121+
SourceItem._lookup_cache[property_id] = {}
122+
elif value not in SourceItem._lookup_cache[property_id]:
123+
logging.error('No lookup performed for %s:%s', property_id, value)
124+
elif SourceItem._lookup_cache[property_id][value] is not None:
125+
logging.error('Duplicate discovered for %s:%s %s+%s',
126+
property_id, value, qid, SourceItem._lookup_cache[property_id][value])
127+
SourceItem._lookup_cache[property_id][value] = qid
128+
101129
def parse(self, text: str) -> None:
102130
raise NotImplementedError('Subclasses must implement parse')
103131

tests/test_source_item.py

Lines changed: 203 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from unittest import TestCase, mock
44

55
import wdpy
6-
from wdpy import SourceItem
6+
from wdpy import SourceItem, Statement, Snak
77

88

99
class TestRequest(TestCase):
@@ -69,10 +69,210 @@ class Derived(Base):
6969
def test_no_child_json(self):
7070
class Base(SourceItem):
7171
_config = {'a': 1}
72-
72+
7373
with mock.patch.object(SourceItem, '_load_config', return_value={}):
7474
class Derived(Base):
7575
pass
76-
76+
7777
self.assertEqual(Derived._config, {'a': 1})
7878
self.assertIsNot(Derived._config, Base._config)
79+
80+
81+
class TestLookup(TestCase):
82+
def setUp(self):
83+
SourceItem._lookup_cache.clear()
84+
85+
@mock.patch('wdpy.source_item.haswbstatement')
86+
def test_hit_returns_qid(self, mock_hws):
87+
mock_hws.return_value = ['Q42']
88+
self.assertEqual(SourceItem.lookup('P31', 'some-id'), 'Q42')
89+
90+
@mock.patch('wdpy.source_item.haswbstatement')
91+
def test_miss_returns_none(self, mock_hws):
92+
mock_hws.return_value = None
93+
self.assertIsNone(SourceItem.lookup('P31', 'unknown'))
94+
95+
@mock.patch('wdpy.source_item.haswbstatement')
96+
def test_second_call_uses_cache(self, mock_hws):
97+
mock_hws.return_value = ['Q42']
98+
SourceItem.lookup('P31', 'some-id')
99+
SourceItem.lookup('P31', 'some-id')
100+
mock_hws.assert_called_once()
101+
102+
@mock.patch('wdpy.source_item.haswbstatement')
103+
def test_negative_result_cached(self, mock_hws):
104+
mock_hws.return_value = None
105+
SourceItem.lookup('P31', 'unknown')
106+
SourceItem.lookup('P31', 'unknown')
107+
mock_hws.assert_called_once()
108+
109+
@mock.patch('wdpy.source_item.haswbstatement')
110+
def test_different_ids_queried_separately(self, mock_hws):
111+
mock_hws.side_effect = [['Q1'], ['Q2']]
112+
self.assertEqual(SourceItem.lookup('P31', 'id-1'), 'Q1')
113+
self.assertEqual(SourceItem.lookup('P31', 'id-2'), 'Q2')
114+
self.assertEqual(mock_hws.call_count, 2)
115+
116+
@mock.patch('wdpy.source_item.haswbstatement')
117+
def test_different_properties_queried_separately(self, mock_hws):
118+
mock_hws.side_effect = [['Q1'], ['Q2']]
119+
SourceItem.lookup('P31', 'same-id')
120+
SourceItem.lookup('P496', 'same-id')
121+
self.assertEqual(mock_hws.call_count, 2)
122+
123+
@mock.patch('wdpy.source_item.haswbstatement')
124+
def test_multiple_results_warns_and_returns_first(self, mock_hws):
125+
mock_hws.return_value = ['Q1', 'Q2', 'Q3']
126+
with self.assertLogs('root', level='WARNING') as cm:
127+
result = SourceItem.lookup('P31', 'dup-id')
128+
self.assertEqual(result, 'Q1')
129+
self.assertTrue(any('3 instances' in line for line in cm.output))
130+
131+
132+
def _ext_id_statement(property_id: str, value: str, qid: str) -> Statement:
133+
"""Build a minimal external-id Statement with a GUID-style id."""
134+
return Statement(mainsnak=Snak(property_id, (value,)), id=f'{qid}$abc-def')
135+
136+
137+
class TestRegisterNewItem(TestCase):
138+
def setUp(self):
139+
SourceItem._lookup_cache.clear()
140+
141+
def _register(self, statements, expected_qid=None, expected_value=None,
142+
expected_property=None):
143+
"""Helper: patch Snak.type_of to return 'external-id' and call register."""
144+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
145+
SourceItem.register_new_item(statements)
146+
147+
# --- happy path ---
148+
149+
def test_registers_qid_after_lookup_miss(self):
150+
"""Normal flow: lookup recorded None, item created, cache updated."""
151+
SourceItem._lookup_cache['P496'] = {'0000-0001': None}
152+
s = _ext_id_statement('P496', '0000-0001', 'Q42')
153+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
154+
SourceItem.register_new_item([s])
155+
self.assertEqual(SourceItem._lookup_cache['P496']['0000-0001'], 'Q42')
156+
157+
def test_qid_extracted_from_statement_id(self):
158+
"""QID is the part of statement.id before '$'."""
159+
SourceItem._lookup_cache['P496'] = {'orcid-1': None}
160+
s = Statement(mainsnak=Snak('P496', ('orcid-1',)), id='Q999$some-guid')
161+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
162+
SourceItem.register_new_item([s])
163+
self.assertEqual(SourceItem._lookup_cache['P496']['orcid-1'], 'Q999')
164+
165+
def test_multiple_external_id_statements(self):
166+
"""All external-id statements in the list are registered."""
167+
SourceItem._lookup_cache['P496'] = {'id-a': None}
168+
SourceItem._lookup_cache['P213'] = {'isni-b': None}
169+
statements = [
170+
_ext_id_statement('P496', 'id-a', 'Q10'),
171+
_ext_id_statement('P213', 'isni-b', 'Q10'),
172+
]
173+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
174+
SourceItem.register_new_item(statements)
175+
self.assertEqual(SourceItem._lookup_cache['P496']['id-a'], 'Q10')
176+
self.assertEqual(SourceItem._lookup_cache['P213']['isni-b'], 'Q10')
177+
178+
# --- error conditions ---
179+
180+
def test_property_not_in_cache_logs_error_and_registers(self):
181+
"""Property never looked up: logs error but still writes to cache."""
182+
s = _ext_id_statement('P496', 'new-id', 'Q7')
183+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
184+
with self.assertLogs('root', level='ERROR') as cm:
185+
SourceItem.register_new_item([s])
186+
self.assertTrue(any('No lookup performed for P496' in line for line in cm.output))
187+
self.assertEqual(SourceItem._lookup_cache['P496']['new-id'], 'Q7')
188+
189+
def test_value_not_in_cache_logs_error_and_registers(self):
190+
"""Property in cache but value never looked up: logs error, writes to cache."""
191+
SourceItem._lookup_cache['P496'] = {} # property known, but value absent
192+
s = _ext_id_statement('P496', 'unseen-id', 'Q8')
193+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
194+
with self.assertLogs('root', level='ERROR') as cm:
195+
SourceItem.register_new_item([s])
196+
self.assertTrue(any('No lookup performed for P496:unseen-id' in line for line in cm.output))
197+
self.assertEqual(SourceItem._lookup_cache['P496']['unseen-id'], 'Q8')
198+
199+
def test_duplicate_logs_error_and_overwrites(self):
200+
"""Cached value already non-None: logs duplicate error, still updates cache."""
201+
SourceItem._lookup_cache['P496'] = {'dup-id': 'Q1'}
202+
s = _ext_id_statement('P496', 'dup-id', 'Q2')
203+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
204+
with self.assertLogs('root', level='ERROR') as cm:
205+
SourceItem.register_new_item([s])
206+
self.assertTrue(any('Duplicate discovered' in line for line in cm.output))
207+
self.assertEqual(SourceItem._lookup_cache['P496']['dup-id'], 'Q2')
208+
209+
# --- filtering ---
210+
211+
def test_non_external_id_statement_is_skipped(self):
212+
"""Statements whose property type is not external-id are ignored."""
213+
s = Statement(mainsnak=Snak('P31', ('Q5',)), id='Q42$abc')
214+
with mock.patch.object(Snak, 'type_of', return_value='wikibase-item'):
215+
SourceItem.register_new_item([s])
216+
self.assertEqual(SourceItem._lookup_cache, {})
217+
218+
def test_statement_without_id_is_skipped(self):
219+
"""Statement with no id (not yet saved) is silently ignored."""
220+
s = Statement(mainsnak=Snak('P496', ('some-val',)), id=None)
221+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
222+
SourceItem.register_new_item([s])
223+
self.assertEqual(SourceItem._lookup_cache, {})
224+
225+
def test_statement_without_value_is_skipped(self):
226+
"""Snak with no value (novalue/somevalue) is silently ignored."""
227+
s = Statement(mainsnak=Snak('P496', None), id='Q42$abc')
228+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
229+
SourceItem.register_new_item([s])
230+
self.assertEqual(SourceItem._lookup_cache, {})
231+
232+
# --- interaction with lookup() ---
233+
234+
def test_register_makes_subsequent_lookup_use_cache(self):
235+
"""After registering, lookup() returns cached QID without calling haswbstatement."""
236+
SourceItem._lookup_cache['P496'] = {'0000-0002': None}
237+
s = _ext_id_statement('P496', '0000-0002', 'Q55')
238+
with mock.patch.object(Snak, 'type_of', return_value='external-id'):
239+
SourceItem.register_new_item([s])
240+
with mock.patch('wdpy.source_item.haswbstatement') as mock_hws:
241+
result = SourceItem.lookup('P496', '0000-0002')
242+
self.assertEqual(result, 'Q55')
243+
mock_hws.assert_not_called()
244+
245+
246+
class TestProposedLabel(TestCase):
247+
def _item(self):
248+
return SourceItem()
249+
250+
def test_proposed_label_initially_none(self):
251+
self.assertIsNone(self._item().proposed_label)
252+
253+
def test_p1476_sets_proposed_label(self):
254+
item = self._item()
255+
item.add_claim('P1476', 'Some Title', 'en')
256+
self.assertEqual(item.proposed_label, 'Some Title')
257+
258+
def test_first_p1476_wins(self):
259+
item = self._item()
260+
item.add_claim('P1476', 'First Title', 'en')
261+
item.add_claim('P1476', 'Second Title', 'fr')
262+
self.assertEqual(item.proposed_label, 'First Title')
263+
264+
def test_other_property_does_not_set_label(self):
265+
item = self._item()
266+
item.add_claim('P31', 'Q5')
267+
self.assertIsNone(item.proposed_label)
268+
269+
def test_p1476_without_value_does_not_set_label(self):
270+
item = self._item()
271+
item.add_claim('P1476')
272+
self.assertIsNone(item.proposed_label)
273+
274+
def test_p1476_claim_still_added_to_patch(self):
275+
item = self._item()
276+
s = item.add_claim('P1476', 'A Title', 'en')
277+
self.assertIn(s, item.patch)
278+
self.assertEqual(item.patch[0].mainsnak.property, 'P1476')

0 commit comments

Comments
 (0)