feat: recency ranking, noise filter, subdomain resolution

singularityjason · claude · singularityjason · commit 0eef4b9de7f9 · 2026-03-25T08:46:03.000+08:00
Query results are now weighted by recency (30-day half-life, floored at
30% of the BM25 score) so recent memories rank higher. A content noise
filter rejects known phrases (ok, thanks, got it), very short strings,
and JSON blobs.

Vendor normalization now strips api/app/gateway subdomains and
paths — api.openai.com/v1 resolves to openai.com.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/lightning_memory/db.py b/lightning_memory/db.py
@@ -229,16 +229,25 @@ def query_memories(
     hit_ids = []
     for row in rows:
         hit_ids.append(row["id"])
+        bm25_score = -row["rank"]  # BM25 returns negative; negate for positive
+        # Recency decay: memories lose relevance over time
+        # Half-life of 30 days — a 60-day-old memory has 0.25x the weight
+        age_days = (now - row["created_at"]) / 86400
+        recency_weight = 0.5 ** (age_days / 30.0)
+        relevance = bm25_score * (0.3 + 0.7 * recency_weight)  # floor at 30% of original score
         results.append({
             "id": row["id"],
             "content": row["content"],
             "type": row["type"],
             "metadata": json.loads(row["metadata"]),
             "nostr_event_id": row["nostr_event_id"],
             "created_at": format_utc(row["created_at"]),
-            "relevance": -row["rank"],  # BM25 returns negative scores; negate for intuitive ordering
+            "relevance": round(relevance, 6),
         })
 
+    # Re-sort by recency-weighted relevance
+    results.sort(key=lambda r: r["relevance"], reverse=True)
+
     # Update access tracking for returned memories
     if hit_ids:
         _bump_access(conn, hit_ids, now)
diff --git a/lightning_memory/memory.py b/lightning_memory/memory.py
@@ -16,22 +16,31 @@
 def normalize_vendor(name: str) -> str:
     """Normalize a vendor name for consistent matching.
 
-    Strips protocol, www prefix, trailing slashes, and lowercases.
+    Lowercases, strips the protocol and any path, then drops one leading
+    www/api/app/gateway/gw label unless that would leave a bare TLD
+    ("api.com" stays "api.com"), so api.openai.com resolves to "openai.com".
+
     Examples:
         "https://www.Bitrefill.com/" -> "bitrefill.com"
-        "WWW.BITREFILL.COM" -> "bitrefill.com"
-        "bitrefill.com" -> "bitrefill.com"
+        "api.openai.com/v1" -> "openai.com"
+        "app.example.io" -> "example.io"
+        "bitrefill" -> "bitrefill"
     """
     v = name.strip().lower()
     # Strip protocol
     for prefix in ("https://", "http://"):
         if v.startswith(prefix):
             v = v[len(prefix):]
             break
-    # Strip www. prefix
-    if v.startswith("www."):
-        v = v[4:]
-    # Strip trailing slash
+    # Strip path (everything after first /)
+    if "/" in v:
+        v = v.split("/")[0]
+    # Strip one common subdomain, but never down to a bare TLD ("api.com")
+    for sub in ("www.", "api.", "app.", "gateway.", "gw."):
+        if v.startswith(sub) and "." in v[len(sub):]:
+            v = v[len(sub):]
+            break
+    # No-op after the path split above; kept from the previous version
     v = v.rstrip("/")
     return v
 
@@ -69,6 +78,34 @@ def parse_since(since: str) -> float:
 _DEDUP_DEFAULT_THRESHOLD = 0.80
 
 
+# Filler phrases rejected by _is_noise() (compared lowercased, minus trailing "."/"!")
+_NOISE_PATTERNS = {
+    "ok", "okay", "got it", "thanks", "thank you", "sure", "yes", "no",
+    "understood", "ack", "acknowledged", "noted", "done", "fine", "right",
+}
+_MIN_CONTENT_LENGTH = 3  # stripped content shorter than this many chars is noise
+
+
+def _is_noise(content: str) -> bool:
+    """Return True when *content* should be rejected instead of stored.
+
+    Noise is anything too short, a known filler phrase, or a JSON blob.
+    """
+    stripped = content.strip()
+    if len(stripped) < _MIN_CONTENT_LENGTH:
+        return True
+    if stripped.lower().rstrip(".!") in _NOISE_PATTERNS:
+        return True
+    # Reject JSON blobs (agents sometimes dump raw API responses)
+    if stripped.startswith(("{", "[")):
+        try:
+            json.loads(stripped)
+            return True  # Valid JSON blob — not a natural language memory
+        except (ValueError, RecursionError):
+            pass  # not JSON (JSONDecodeError is a ValueError) or nested too deeply to parse
+    return False
+
+
 def _jaccard(text_a: str, text_b: str, min_word_len: int = 3) -> float:
     """Compute Jaccard similarity on word sets (words >= min_word_len chars)."""
     words_a = {re.sub(r"[^\w]", "", w) for w in text_a.lower().split() if len(w) >= min_word_len}
@@ -103,6 +140,10 @@ def store(
 
         Types: general, transaction, vendor, preference, error, decision
         """
+        # Reject noise content
+        if _is_noise(content):
+            return {"status": "rejected", "reason": "noise", "content_preview": content[:50]}
+
         # Check for near-duplicate before storing
         existing = self._find_duplicate(content, memory_type, metadata)
         if existing is not None:
diff --git a/tests/test_memory_quality.py b/tests/test_memory_quality.py
@@ -167,6 +167,61 @@ def test_no_price_contradiction_for_small_changes(engine):
     assert "contradictions" not in r2 or len(r2.get("contradictions", [])) == 0
 
 
+# --- Noise filtering tests ---
+
+
+def test_noise_short_content_rejected(engine):
+    """Content below the minimum length is rejected even if it is not a known phrase."""
+    result = engine.store("hi")
+    assert result.get("status") == "rejected"
+    assert result.get("reason") == "noise"
+
+
+def test_noise_known_phrase_rejected(engine):
+    """Known noise phrases are rejected regardless of case or trailing ./! punctuation."""
+    for phrase in ["got it", "thanks", "sure", "acknowledged", "Thanks!", "Got it."]:
+        result = engine.store(phrase)
+        assert result.get("status") == "rejected", f"'{phrase}' should be rejected"
+
+
+def test_noise_json_blob_rejected(engine):
+    """JSON blobs (both objects and arrays) should be rejected."""
+    for blob in ('{"key": "value", "nested": {"a": [1,2,3]}}', '[1, 2, 3]'):
+        assert engine.store(blob).get("status") == "rejected", f"{blob!r} should be rejected"
+
+
+def test_noise_valid_content_accepted(engine):
+    """Ordinary natural-language content must get through the noise filter."""
+    outcome = engine.store("Paid 500 sats to bitrefill for a gift card")
+    assert outcome.get("status") != "rejected"
+
+
+# --- Recency-weighted ranking tests ---
+
+
+def test_recent_memories_rank_higher(engine):
+    """Recent memories should rank higher than old ones for same query."""
+    # Store two memories with same keywords but different ages
+    engine.store("bitrefill vendor is reliable for gift cards", memory_type="vendor")
+
+    # Artificially age the first memory by updating created_at
+    engine.conn.execute(
+        "UPDATE memories SET created_at = created_at - 5184000"  # 60 days ago
+    )
+    engine.conn.commit()
+
+    # Store a newer memory
+    engine.store("bitrefill vendor has new pricing for gift cards", memory_type="vendor")
+
+    results = engine.query("bitrefill vendor gift cards")
+    assert len(results) >= 2
+    # Results arrive sorted by relevance, so comparing relevances alone is a
+    # tautology; check which memory actually landed in first place.
+    assert "new pricing" in results[0]["content"], "Newer memory should rank first"
+    relevances = [r["relevance"] for r in results]
+    assert relevances == sorted(relevances, reverse=True)
+
+
 def test_contradiction_includes_existing_preview(engine):
     """Contradiction should include a preview of the conflicting memory."""
     engine.store(
diff --git a/tests/test_vendor_normalization.py b/tests/test_vendor_normalization.py
@@ -45,6 +45,25 @@ def test_normalize_already_clean():
     assert normalize_vendor("openai") == "openai"
 
 
+def test_normalize_subdomain_api():
+    for raw in ("api.openai.com", "https://api.openai.com/v1"):
+        assert normalize_vendor(raw) == "openai.com"
+
+
+def test_normalize_subdomain_app():
+    assert "example.io" == normalize_vendor("app.example.io")
+
+
+def test_normalize_subdomain_gateway():
+    for host in ("gateway.bitrefill.com", "gw.bitrefill.com"):
+        assert normalize_vendor(host) == "bitrefill.com"
+
+
+def test_normalize_strips_path():
+    cases = {"bitrefill.com/api/v1": "bitrefill.com", "https://api.openai.com/v1/chat": "openai.com"}
+    assert {raw: normalize_vendor(raw) for raw in cases} == cases
+
+
 def test_normalize_empty():
     assert normalize_vendor("") == ""