|
def normalize_vendor(name: str) -> str:
    """Normalize a vendor name for consistent matching.

    Lowercases the input and strips surrounding whitespace, an
    ``http://``/``https://`` protocol, any path, query string or fragment,
    and the common service subdomains ``www``, ``api``, ``app``, ``gateway``
    and ``gw``, so that api.openai.com, www.openai.com and openai.com all
    resolve to "openai.com".

    A subdomain prefix is removed only while what remains still contains a
    dot. This keeps real domains such as "api.com" or "app.io" intact
    instead of collapsing them to a bare TLD, and it means stacked prefixes
    ("www.api.openai.com") are all removed.

    Args:
        name: Vendor name, hostname, or URL.

    Returns:
        The normalized vendor key. Names without a dot are returned
        lowercased and trimmed but otherwise unchanged.

    Examples:
        "https://www.Bitrefill.com/" -> "bitrefill.com"
        "api.openai.com/v1"          -> "openai.com"
        "app.example.io"             -> "example.io"
        "www.api.openai.com"         -> "openai.com"
        "api.com"                    -> "api.com"
        "bitrefill"                  -> "bitrefill"
    """
    v = name.strip().lower()

    # Strip protocol
    for prefix in ("https://", "http://"):
        if v.startswith(prefix):
            v = v[len(prefix):]
            break

    # Keep only the host: drop path, query string and fragment. Cutting at
    # the first "/" also removes any trailing slash.
    for sep in ("/", "?", "#"):
        v = v.partition(sep)[0]

    # Strip common subdomains repeatedly, but never reduce the host to a
    # single label ("api.com" must not become "com").
    subdomains = ("www.", "api.", "app.", "gateway.", "gw.")
    stripped = True
    while stripped:
        stripped = False
        for sub in subdomains:
            remainder = v[len(sub):]
            if v.startswith(sub) and "." in remainder:
                v = remainder
                stripped = True
                break

    return v
37 | 46 |
|
@@ -69,6 +78,34 @@ def parse_since(since: str) -> float: |
69 | 78 | _DEDUP_DEFAULT_THRESHOLD = 0.80 |
70 | 79 |
|
71 | 80 |
|
| 81 | +# Noise patterns that should not be stored as memories |
| 82 | +_NOISE_PATTERNS = { |
| 83 | + "ok", "okay", "got it", "thanks", "thank you", "sure", "yes", "no", |
| 84 | + "understood", "ack", "acknowledged", "noted", "done", "fine", "right", |
| 85 | +} |
| 86 | +_MIN_CONTENT_LENGTH = 3 # memories shorter than this are noise |
| 87 | + |
| 88 | + |
| 89 | +def _is_noise(content: str) -> bool: |
| 90 | + """Check if content is too short, a known noise phrase, or a JSON blob. |
| 91 | +
|
| 92 | + Returns True if the content should be rejected. |
| 93 | + """ |
| 94 | + stripped = content.strip() |
| 95 | + if len(stripped) < _MIN_CONTENT_LENGTH: |
| 96 | + return True |
| 97 | + if stripped.lower().rstrip(".!") in _NOISE_PATTERNS: |
| 98 | + return True |
| 99 | + # Reject JSON blobs (agents sometimes dump raw API responses) |
| 100 | + if stripped.startswith("{") or stripped.startswith("["): |
| 101 | + try: |
| 102 | + json.loads(stripped) |
| 103 | + return True # Valid JSON blob — not a natural language memory |
| 104 | + except (json.JSONDecodeError, ValueError): |
| 105 | + pass |
| 106 | + return False |
| 107 | + |
| 108 | + |
72 | 109 | def _jaccard(text_a: str, text_b: str, min_word_len: int = 3) -> float: |
73 | 110 | """Compute Jaccard similarity on word sets (words >= min_word_len chars).""" |
74 | 111 | words_a = {re.sub(r"[^\w]", "", w) for w in text_a.lower().split() if len(w) >= min_word_len} |
@@ -103,6 +140,10 @@ def store( |
103 | 140 |
|
104 | 141 | Types: general, transaction, vendor, preference, error, decision |
105 | 142 | """ |
| 143 | + # Reject noise content |
| 144 | + if _is_noise(content): |
| 145 | + return {"status": "rejected", "reason": "noise", "content_preview": content[:50]} |
| 146 | + |
106 | 147 | # Check for near-duplicate before storing |
107 | 148 | existing = self._find_duplicate(content, memory_type, metadata) |
108 | 149 | if existing is not None: |
|
0 commit comments