Skip to content

Commit 3f8b15a

Browse files
ghuron and claude committed
Add select_outdated and rank_by_recency to Statement
Port delete_all_but_one and deprecate_all_but_one from legacy Claim, with improved naming and unit tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3495315 commit 3f8b15a

2 files changed

Lines changed: 258 additions & 1 deletion

File tree

src/wdpy/statement.py

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,4 +174,85 @@ def _display_name(s: Statement) -> str:
174174

175175
def save(self, summary: str) -> Optional[Dict[str, Any]]:
    """Write this statement back through the ``wbsetclaim`` action.

    The statement is serialised with ``self.json()`` and handed to
    ``api_write`` together with the edit summary; whatever ``api_write``
    returns is passed straight back to the caller.
    """
    serialized = self.json()
    return api_write('wbsetclaim', claim=serialized, summary=summary)
178+
179+
@staticmethod
def select_outdated(statements: List[Statement],
                    group_by: Optional[str] = None) -> List[Statement]:
    """Pick the statements to delete so that a single one survives per group.

    When group_by is falsy every statement belongs to one shared group.
    Otherwise a statement's group is the first element of the value of its
    first qualifier whose property equals group_by and whose value is truthy;
    a statement with no such qualifier is never kept and is always part of
    the result.

    Inside a group, if any statement's mainsnak has a snaktype other than
    'value' (novalue/somevalue), the first such statement in input order
    survives and everything else in the group is returned. Otherwise the
    survivor is the first statement whose reference publication date equals
    the group's maximum; a statement without references, or without a
    publication date, is treated as dated '00000000'.

    The result preserves the input order of the statements it contains.
    """
    ungrouped = object()  # sentinel: statement has no usable group_by qualifier

    def group_of(stmt):
        # None is the single shared group used when no grouping is requested.
        if not group_by:
            return None
        for qual in (stmt.qualifiers or []):
            if qual.property == group_by and qual.value:
                return qual.value[0]
        return ungrouped

    def ref_date(stmt):
        # Only evaluated where a comparison actually needs it.
        return (stmt.references and stmt.references.publication_date) or '00000000'

    # Pass 1: newest reference date seen per group; None marks a group that
    # contains a novalue/somevalue statement, which overrides any date.
    newest: Dict[Any, Optional[str]] = {}
    for stmt in statements:
        key = group_of(stmt)
        if key is ungrouped:
            continue
        if stmt.mainsnak.snaktype != 'value':
            newest[key] = None
            continue
        if key not in newest:
            newest[key] = ref_date(stmt)
            continue
        if newest[key] is None:
            continue
        candidate = ref_date(stmt)
        if candidate > newest[key]:
            newest[key] = candidate

    # Pass 2: the first qualifying statement of each group survives, the rest go.
    settled: set = set()
    outdated: List[Statement] = []
    for stmt in statements:
        key = group_of(stmt)
        if key is ungrouped:
            outdated.append(stmt)
            continue
        if newest.get(key) is None:
            # novalue/somevalue group: only a non-value statement may survive
            survives = stmt.mainsnak.snaktype != 'value' and key not in settled
        else:
            survives = key not in settled and not (ref_date(stmt) < newest[key])
        if survives:
            settled.add(key)
        else:
            outdated.append(stmt)
    return outdated
231+
232+
@staticmethod
def rank_by_recency(statements: List[Statement]) -> None:
    """Deprecate every statement except the most recently sourced one.

    Nothing is touched when any statement already has rank 'preferred'.
    Statements that carry a P2241 (reason for deprecation) qualifier are
    skipped entirely: they neither compete for the newest date nor have
    their rank changed. Among the remaining statements the first one whose
    reference publication date equals the maximum is set to 'normal' and
    all others are set to 'deprecated'. A statement without references, or
    without a publication date, is treated as dated '00000000'.

    Ranks are modified in place; the method returns None.
    """
    if any(stmt.rank == 'preferred' for stmt in statements):
        return

    def has_reason(stmt):
        return any(qual.property == 'P2241' for qual in (stmt.qualifiers or []))

    def ref_date(stmt):
        return (stmt.references and stmt.references.publication_date) or '00000000'

    # Find the newest date among the statements that take part in ranking.
    latest = '00000000'
    for stmt in statements:
        if has_reason(stmt):
            continue
        stamp = ref_date(stmt)
        if stamp > latest:
            latest = stamp

    # Exactly one statement (the first carrying the newest date) stays normal.
    winner_pending = True
    for stmt in statements:
        if has_reason(stmt):
            continue
        if winner_pending and ref_date(stmt) == latest:
            stmt.rank = 'normal'
            winner_pending = False
        else:
            stmt.rank = 'deprecated'

tests/test_statement.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
from unittest import TestCase, mock
33
from wdpy import Snak, Statement, References
4+
from wdpy.references import _PUB_DATES, _REDIRECTS
45

56

67
# ── helpers ──────────────────────────────────────────────────────────────────
@@ -292,3 +293,178 @@ def test_mixed_ordinals_only_duplicates_deleted(self):
292293
b2 = p50('Q99', '2')
293294
result = self.dedup(a1, a1_dup, b2)
294295
self.assertEqual(result, [a1_dup])
296+
297+
298+
# ── helpers for date-based tests ──────────────────────────────────────────────
299+
300+
def _dated_refs(date: int) -> References:
    """Build References holding one P248 snak whose source is dated `date`.

    A synthetic item id is derived from the date and registered in
    _PUB_DATES under that date before the References object is created.
    """
    source_qid = f'Q{date}'
    _PUB_DATES[source_qid] = date
    stated_in = Snak('P248', (source_qid,))
    reference_block = {'snaks': {'P248': [stated_in]}}
    return References([reference_block])
305+
306+
307+
class SelectOutdated(TestCase):
    """Tests for Statement.select_outdated, with and without group_by."""

    GROUP = 'P1545'  # qualifier property passed as group_by in the grouped tests

    def setUp(self):
        # Every test starts with empty _PUB_DATES and _REDIRECTS tables.
        for table in (_PUB_DATES, _REDIRECTS):
            table.clear()

    def _s(self, val='Q1', date=None):
        # P31 value statement; references are attached only when a date is given.
        refs = None
        if date is not None:
            refs = _dated_refs(date)
        return Statement(Snak('P31', (val,)), references=refs)

    def _sg(self, val='Q1', group='1', date=None):
        # Same as _s, plus a GROUP qualifier holding `group`.
        stmt = self._s(val, date)
        stmt.qualifiers = [Snak(self.GROUP, (group,))]
        return stmt

    def _nv(self, group=None):
        # P31 novalue statement, optionally carrying a GROUP qualifier.
        quals = None
        if group:
            quals = [Snak(self.GROUP, (group,))]
        return Statement(Snak('P31', None, 'novalue'), qualifiers=quals)

    # ── no group_by ───────────────────────────────────────────────────────────

    def test_empty(self):
        outdated = Statement.select_outdated([])
        self.assertEqual(outdated, [])

    def test_single(self):
        only = self._s()
        outdated = Statement.select_outdated([only])
        self.assertEqual(outdated, [])

    def test_older_deleted(self):
        old = self._s('Q1', 20200101)
        new = self._s('Q2', 20220101)
        outdated = Statement.select_outdated([new, old])
        self.assertEqual(outdated, [old])

    def test_older_first_still_deleted(self):
        old = self._s('Q1', 20200101)
        new = self._s('Q2', 20220101)
        outdated = Statement.select_outdated([old, new])
        self.assertEqual(outdated, [old])

    def test_no_refs_deleted_when_other_is_dated(self):
        no_date = self._s('Q1')
        dated = self._s('Q2', 20220101)
        outdated = Statement.select_outdated([no_date, dated])
        self.assertEqual(outdated, [no_date])

    def test_equal_dates_first_kept_second_deleted(self):
        first = self._s('Q1', 20220101)
        second = self._s('Q2', 20220101)
        outdated = Statement.select_outdated([first, second])
        self.assertEqual(outdated, [second])

    def test_novalue_beats_value(self):
        val = self._s('Q1', 20220101)
        nv = self._nv()
        outdated = Statement.select_outdated([val, nv])
        self.assertIn(val, outdated)
        self.assertNotIn(nv, outdated)

    def test_two_novalue_second_deleted(self):
        nv1 = self._nv()
        nv2 = self._nv()
        outdated = Statement.select_outdated([nv1, nv2])
        self.assertEqual(outdated, [nv2])

    # ── with group_by ─────────────────────────────────────────────────────────

    def test_missing_qualifier_always_deleted(self):
        unqualified = self._s()  # no P1545
        outdated = Statement.select_outdated([unqualified], group_by=self.GROUP)
        self.assertEqual(outdated, [unqualified])

    def test_groups_managed_independently(self):
        a_old = self._sg('Q1', '1', 20200101)
        a_new = self._sg('Q2', '1', 20220101)
        b_old = self._sg('Q3', '2', 20190101)
        b_new = self._sg('Q4', '2', 20210101)
        outdated = Statement.select_outdated(
            [a_old, a_new, b_old, b_new],
            group_by=self.GROUP,
        )
        for stale in (a_old, b_old):
            self.assertIn(stale, outdated)
        for fresh in (a_new, b_new):
            self.assertNotIn(fresh, outdated)

    def test_unqualified_deleted_alongside_group_outdated(self):
        no_qual = self._s()
        old = self._sg('Q1', '1', 20200101)
        new = self._sg('Q2', '1', 20220101)
        outdated = Statement.select_outdated([no_qual, old, new], group_by=self.GROUP)
        for dropped in (no_qual, old):
            self.assertIn(dropped, outdated)
        self.assertNotIn(new, outdated)

    def test_novalue_in_group_beats_value(self):
        val = self._sg('Q1', '1', 20220101)
        nv = self._nv(group='1')
        outdated = Statement.select_outdated([val, nv], group_by=self.GROUP)
        self.assertIn(val, outdated)
        self.assertNotIn(nv, outdated)
399+
400+
401+
class RankByRecency(TestCase):
    """Tests for Statement.rank_by_recency."""

    def setUp(self):
        # Every test starts with empty _PUB_DATES and _REDIRECTS tables.
        for table in (_PUB_DATES, _REDIRECTS):
            table.clear()

    def _s(self, date=None, *, rank=None, p2241=False):
        # P31=Q1 statement with optional dated references, rank and P2241 qualifier.
        refs = None
        if date is not None:
            refs = _dated_refs(date)
        quals = None
        if p2241:
            quals = [Snak('P2241', ('Q1',))]
        return Statement(Snak('P31', ('Q1',)), rank=rank, qualifiers=quals, references=refs)

    # ── no-op cases ───────────────────────────────────────────────────────────

    def test_empty(self):
        Statement.rank_by_recency([])  # must not raise

    def test_preferred_halts_all_changes(self):
        preferred = self._s(20220101, rank='preferred')
        old = self._s(20200101)
        Statement.rank_by_recency([old, preferred])
        self.assertIsNone(old.rank)

    def test_p2241_statement_not_modified(self):
        with_reason = self._s(p2241=True)
        Statement.rank_by_recency([with_reason])
        self.assertIsNone(with_reason.rank)

    # ── rank assignment ───────────────────────────────────────────────────────

    def test_single_no_date_becomes_normal(self):
        only = self._s()
        Statement.rank_by_recency([only])
        self.assertEqual(only.rank, 'normal')

    def test_newer_normal_older_deprecated(self):
        new = self._s(20220101)
        old = self._s(20200101)
        Statement.rank_by_recency([new, old])
        self.assertEqual(new.rank, 'normal')
        self.assertEqual(old.rank, 'deprecated')

    def test_order_independent(self):
        new = self._s(20220101)
        old = self._s(20200101)
        Statement.rank_by_recency([old, new])
        self.assertEqual(new.rank, 'normal')
        self.assertEqual(old.rank, 'deprecated')

    def test_no_refs_deprecated_when_other_is_dated(self):
        dated = self._s(20220101)
        undated = self._s()
        Statement.rank_by_recency([dated, undated])
        self.assertEqual(dated.rank, 'normal')
        self.assertEqual(undated.rank, 'deprecated')

    def test_equal_dates_first_normal_second_deprecated(self):
        first = self._s(20220101)
        second = self._s(20220101)
        Statement.rank_by_recency([first, second])
        self.assertEqual(first.rank, 'normal')
        self.assertEqual(second.rank, 'deprecated')

    def test_p2241_excluded_others_ranked(self):
        with_reason = self._s(20230101, p2241=True)
        new = self._s(20220101)
        old = self._s(20200101)
        Statement.rank_by_recency([with_reason, new, old])
        self.assertIsNone(with_reason.rank)
        self.assertEqual(new.rank, 'normal')
        self.assertEqual(old.rank, 'deprecated')

0 commit comments

Comments
 (0)