# Remote location of the LoCoMo-10 benchmark dataset and its on-disk cache path.
LOCOMO_URL = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
LOCOMO_CACHE = Path(__file__).resolve().parent.parent / "tests" / ".locomo_cache" / "locomo10.json"

# Which conversation(s) to benchmark: an integer index as a string, or "all".
CONV_INDEX = os.environ.get("LOCOMO_CONV_INDEX", "all")
# Maximum number of QA pairs taken from each conversation.
MAX_QA = int(os.environ.get("LOCOMO_MAX_QA", "150"))
# Maximum tokens the model may generate per answer.
MAX_GEN = int(os.environ.get("LOCOMO_MAX_TOKENS", "32"))
# Number of multi-turn rounds per benchmark run.
NUM_TURNS = int(os.environ.get("LOCOMO_NUM_TURNS", "150"))
# Comma-separated top-k settings; "KxR" means retrieve K docs and repeat them
# R times to build a long context. Entries are stripped before parsing, so the
# stray trailing space in the default is harmless — TODO confirm it's intended.
TOP_K_LIST = os.environ.get("LOCOMO_TOP_K_LIST", "20,50,5x10 ")
2121
2222
2323async def _stream_ttft (prompt , model , max_tokens = 512 , request_id = None ):
@@ -56,10 +56,38 @@ def run_ttft(prompt, model, max_tokens=512, request_id=None):
5656 return asyncio .run (_stream_ttft (prompt , model , max_tokens , request_id ))
5757
5858
def build_prompt(question, context_str, importance_ranking=None):
    """Assemble the QA prompt from memories, an optional document-importance
    ranking, and the question.

    The ranking section is inserted only when *importance_ranking* is truthy,
    so an empty string behaves the same as None.
    """
    pieces = [
        f"Memories:\n{context_str}\n",
        "Based on the memories above, concisely answer the following "
        "question in as few words as possible.\n",
    ]
    if importance_ranking:
        pieces.append(
            "Please read the documents in the following importance ranking:\n"
            f"{importance_ranking}\n"
            "Prioritize information from higher-ranked documents.\n"
        )
    pieces.append(f"Question: {question}\nAnswer:")
    return "".join(pieces)
69+
70+
def build_importance_ranking(original_ids, reordered_ids):
    """Map original retrieval order to positions in the reordered doc list.

    With repeated docs the same doc_id appears multiple times, so only the
    *first* occurrence of each unique doc in the original order is kept, and
    it is mapped to that doc's first position in the reordered list.

    Returns a string like "[Doc_2] > [Doc_3] > [Doc_1]".
    """
    # First occurrence of each doc in the reordered list -> its 1-based [Doc_N] slot.
    first_slot = {}
    for rank, doc in enumerate(reordered_ids, start=1):
        first_slot.setdefault(doc, rank)
    # dict.fromkeys deduplicates while preserving first-seen order.
    ordered_unique = dict.fromkeys(original_ids)
    return " > ".join(
        f"[Doc_{first_slot[doc]}]" for doc in ordered_unique if doc in first_slot
    )
6391
6492
6593def llm_judge (question , prediction , ground_truth ):
@@ -120,18 +148,20 @@ def strip_thinking(text):
120148
def build_context_str(doc_ids, corpus_map):
    """Render retrieved docs as a "\\n\\n"-joined context string, each prefixed
    with a 1-based "[Doc_N]" tag so prompts can reference docs by position.

    Doc text is looked up in *corpus_map* by str(doc_id), preferring the
    "text" field, then "content", then a "[doc <id>]" placeholder.
    """
    def _render(slot, doc_id):
        # Per-doc lookup with graceful fallback for missing entries/fields.
        entry = corpus_map.get(str(doc_id), {})
        body = entry.get("text", entry.get("content", f"[doc {doc_id}]"))
        return f"[Doc_{slot}] {body}"

    return "\n\n".join(_render(i + 1, d) for i, d in enumerate(doc_ids))
128156
129157
130158def run_multi_turn (retriever , user_id , qa_pairs , model , top_k ,
131- use_reorder = False , cp_available = False ):
159+ use_reorder = False , cp_available = False , repeat_times = 1 ):
132160 """Run multi-turn benchmark: baseline vs reorder."""
133161 label = "reorder" if use_reorder else "baseline"
134- print (f"\n --- { label } ({ NUM_TURNS } turns, k={ top_k } ) ---" )
162+ actual_k = top_k * repeat_times if repeat_times > 1 else top_k
163+ suffix = f" (k={ top_k } x{ repeat_times } ={ actual_k } docs)" if repeat_times > 1 else f" (k={ top_k } )"
164+ print (f"\n --- { label } ({ NUM_TURNS } turns,{ suffix } ) ---" )
135165
136166 ttfts , prefix_matches , f1s , judges = [], [], [], []
137167
@@ -146,6 +176,11 @@ def run_multi_turn(retriever, user_id, qa_pairs, model, top_k,
146176 cmap = retriever .get_corpus_map ()
147177 doc_ids = s [0 ]["top_k_doc_id" ]
148178
179+ # Repeat docs to create long context if requested
180+ if repeat_times > 1 :
181+ doc_ids = doc_ids * repeat_times
182+
183+ original_ids = list (doc_ids ) # preserve original retrieval order
149184 reordered_ids = doc_ids
150185 req_id = None
151186 server_prefix_len , server_has_prefix , server_node_id = 0 , False , - 1
@@ -179,10 +214,19 @@ def run_multi_turn(retriever, user_id, qa_pairs, model, top_k,
179214 # Build context string directly from corpus map
180215 context_str = build_context_str (reordered_ids , cmap )
181216
217+ # Build importance ranking — always include so prompt length is equal
218+ # between baseline and reorder (fair TTFT comparison).
219+ # Baseline: natural order [Doc_1] > [Doc_2] > ...
220+ # Reorder: original retrieval order mapped to reordered positions
221+ if use_reorder and reordered_ids != original_ids :
222+ importance_ranking = build_importance_ranking (original_ids , reordered_ids )
223+ else :
224+ importance_ranking = " > " .join (f"[Doc_{ i + 1 } ]" for i in range (len (reordered_ids )))
225+
182226 # Build prompt and measure TTFT
183- prompt = build_prompt (qa ["question" ], context_str )
227+ prompt = build_prompt (qa ["question" ], context_str , importance_ranking )
184228 out = run_ttft (prompt , model , MAX_GEN , request_id = req_id )
185- gt = str (qa [ "answer" ] )
229+ gt = str (qa . get ( "answer" , qa . get ( "answers" , qa . get ( "gold_answer" , "" ))) )
186230
187231 if idx > 0 :
188232 ttfts .append (out ["ttft" ])
@@ -219,6 +263,7 @@ def run_multi_turn(retriever, user_id, qa_pairs, model, top_k,
219263 "prefix" : avg (prefix_matches ),
220264 "f1" : avg (f1s ),
221265 "judge" : avg (judges ),
266+ "repeat" : repeat_times ,
222267 }
223268 print (f" [{ label } ] TTFT={ stats ['ttft' ]:.4f} s Prefix={ stats ['prefix' ]:.1%} "
224269 f" F1={ stats ['f1' ]:.3f} Judge={ stats ['judge' ]:.3f} " )
@@ -283,60 +328,124 @@ def ingest_conversation(conv_data, retriever, user_id):
283328 run_ttft ("Hello, world." , model , max_tokens = 4 )
284329 print ("Warmup done.\n " )
285330
286- retriever = Mem0Retriever (config = {
287- "llm" : {"provider" : "openai" , "config" : {"model" : "gpt-4.1-mini-2025-04-14" }},
288- "embedder" : {"provider" : "openai" , "config" : {"model" : "text-embedding-3-small" }},
289- })
290-
291- conv_data = all_convs [CONV_INDEX ]
292- qa_pairs = conv_data ["qa" ][:MAX_QA ]
293- conv = conv_data ["conversation" ]
294- print (f"\n { '=' * 70 } " )
295- print (f"CONV { CONV_INDEX } : { conv ['speaker_a' ]} & { conv ['speaker_b' ]} , { len (qa_pairs )} QA pairs" )
296- print (f"{ '=' * 70 } " )
297-
298- user_id = f"locomo_{ CONV_INDEX } _{ uuid .uuid4 ().hex [:6 ]} "
299- n_memories = ingest_conversation (conv_data , retriever , user_id )
300- top_k_values = [int (k ) for k in TOP_K_LIST .split ("," )]
331+ # Parse TOP_K_LIST: supports "20", "50", or "5x10" (k=5, repeat 10 times)
332+ top_k_configs = []
333+ for entry in TOP_K_LIST .split ("," ):
334+ entry = entry .strip ()
335+ if "x" in entry :
336+ k_str , r_str = entry .split ("x" , 1 )
337+ top_k_configs .append ((int (k_str ), int (r_str )))
338+ else :
339+ top_k_configs .append ((int (entry ), 1 ))
340+
341+ # Determine which conversations to run
342+ if CONV_INDEX == "all" :
343+ conv_indices = list (range (len (all_convs )))
344+ else :
345+ conv_indices = [int (CONV_INDEX )]
346+
347+ grand_rows = [] # aggregate across all conversations
348+
349+ for ci in conv_indices :
350+ # Flush SGLang's radix cache between conversations to avoid pressure buildup
351+ try :
352+ requests .post (f"{ INFERENCE_URL } /flush_cache" , timeout = 5 )
353+ except Exception :
354+ pass
301355
302- try :
303- all_rows = []
304- for top_k in top_k_values :
305- print (f"\n ## top_k={ top_k } " )
306- results = {}
307- for use_reorder in [True , False ]:
308- cp_reset () # fresh tree for each mode
309- stats = run_multi_turn (
310- retriever , user_id , qa_pairs , model , top_k ,
311- use_reorder = use_reorder , cp_available = cp_available )
312- results [stats ["label" ]] = stats
313-
314- base_ttft = results ["baseline" ]["ttft" ]
315-
316- for name in ["baseline" , "reorder" ]:
317- s = results [name ]
318- delta = (base_ttft - s ["ttft" ]) / base_ttft * 100 if base_ttft else 0
319- all_rows .append ({
320- "k" : top_k ,
321- "mode" : name ,
322- "ttft" : f"{ s ['ttft' ]:.4f} s" ,
323- "ttft_delta" : f"{ delta :+.1f} %" if name != "baseline" else "-" ,
324- "prefix" : f"{ s ['prefix' ]:.1%} " ,
325- "f1" : f"{ s ['f1' ]:.3f} " ,
326- "judge" : f"{ s ['judge' ]:.3f} " ,
327- })
328-
329- # Summary table
356+ conv_data = all_convs [ci ]
357+ qa_pairs = conv_data ["qa" ][:MAX_QA ]
358+ conv = conv_data ["conversation" ]
330359 print (f"\n { '=' * 70 } " )
331- print (f"RESULTS (conv= { CONV_INDEX } , memories= { n_memories } , turns= { min ( NUM_TURNS , len (qa_pairs )) } ) " )
360+ print (f"CONV { ci } : { conv [ 'speaker_a' ] } & { conv [ 'speaker_b' ] } , { len (qa_pairs )} QA pairs " )
332361 print (f"{ '=' * 70 } " )
333- print (pd .DataFrame (all_rows ).to_string (index = False ))
334362
335- finally :
363+ retriever = Mem0Retriever (config = {
364+ "llm" : {"provider" : "openai" , "config" : {"model" : "gpt-4.1-mini-2025-04-14" }},
365+ "embedder" : {"provider" : "openai" , "config" : {"model" : "text-embedding-3-small" }},
366+ })
367+
368+ user_id = f"locomo_{ ci } _{ uuid .uuid4 ().hex [:6 ]} "
369+ n_memories = ingest_conversation (conv_data , retriever , user_id )
370+
336371 try :
337- retriever .delete_all_memories (user_id = user_id )
338- print (f"\n Cleaned up memories for { user_id } " )
339- except Exception as e :
340- print (f"\n Cleanup warning: { e } " )
341- del retriever
342- import gc ; gc .collect ()
372+ conv_rows = []
373+ for top_k , repeat_times in top_k_configs :
374+ label = f"top_k={ top_k } " + (f"x{ repeat_times } " if repeat_times > 1 else "" )
375+ print (f"\n ## { label } " )
376+ results = {}
377+ for use_reorder in [False , True ]:
378+ cp_reset () # fresh tree for each mode
379+ stats = run_multi_turn (
380+ retriever , user_id , qa_pairs , model , top_k ,
381+ use_reorder = use_reorder , cp_available = cp_available ,
382+ repeat_times = repeat_times )
383+ results [stats ["label" ]] = stats
384+
385+ base_ttft = results ["baseline" ]["ttft" ]
386+
387+ k_label = f"{ top_k } x{ repeat_times } " if repeat_times > 1 else str (top_k )
388+ for name in ["baseline" , "reorder" ]:
389+ s = results [name ]
390+ delta = (base_ttft - s ["ttft" ]) / base_ttft * 100 if base_ttft else 0
391+ row = {
392+ "conv" : ci ,
393+ "k" : k_label ,
394+ "mode" : name ,
395+ "ttft" : s ["ttft" ],
396+ "ttft_delta" : delta if name != "baseline" else 0 ,
397+ "prefix" : s ["prefix" ],
398+ "f1" : s ["f1" ],
399+ "judge" : s ["judge" ],
400+ }
401+ conv_rows .append (row )
402+ grand_rows .append (row )
403+
404+ # Per-conversation summary
405+ print (f"\n { '=' * 70 } " )
406+ print (f"RESULTS (conv={ ci } , memories={ n_memories } , turns={ min (NUM_TURNS , len (qa_pairs ))} )" )
407+ print (f"{ '=' * 70 } " )
408+ df = pd .DataFrame (conv_rows )
409+ df_display = df .copy ()
410+ df_display ["ttft" ] = df_display ["ttft" ].map (lambda x : f"{ x :.4f} s" )
411+ df_display ["ttft_delta" ] = df .apply (
412+ lambda r : f"{ r ['ttft_delta' ]:+.1f} %" if r ["mode" ] != "baseline" else "-" , axis = 1 )
413+ df_display ["prefix" ] = df_display ["prefix" ].map (lambda x : f"{ x :.1%} " )
414+ df_display ["f1" ] = df_display ["f1" ].map (lambda x : f"{ x :.3f} " )
415+ df_display ["judge" ] = df_display ["judge" ].map (lambda x : f"{ x :.3f} " )
416+ print (df_display .drop (columns = ["conv" ]).to_string (index = False ))
417+
418+ finally :
419+ try :
420+ retriever .delete_all_memories (user_id = user_id )
421+ print (f"\n Cleaned up memories for { user_id } " )
422+ except Exception as e :
423+ print (f"\n Cleanup warning: { e } " )
424+ del retriever
425+ import gc ; gc .collect ()
426+
427+ # Grand aggregate table across all conversations
428+ if len (conv_indices ) > 1 :
429+ print (f"\n { '=' * 70 } " )
430+ print (f"AGGREGATE RESULTS ({ len (conv_indices )} conversations)" )
431+ print (f"{ '=' * 70 } " )
432+ gdf = pd .DataFrame (grand_rows )
433+ agg = gdf .groupby (["k" , "mode" ]).agg (
434+ ttft = ("ttft" , "mean" ),
435+ prefix = ("prefix" , "mean" ),
436+ f1 = ("f1" , "mean" ),
437+ judge = ("judge" , "mean" ),
438+ ).reset_index ()
439+ # Compute delta from baseline per k
440+ for k_val in agg ["k" ].unique ():
441+ base = agg .loc [(agg ["k" ] == k_val ) & (agg ["mode" ] == "baseline" ), "ttft" ].values [0 ]
442+ agg .loc [agg ["k" ] == k_val , "ttft_delta" ] = agg .loc [agg ["k" ] == k_val , "ttft" ].apply (
443+ lambda x : (base - x ) / base * 100 if base else 0 )
444+ agg_display = agg .copy ()
445+ agg_display ["ttft" ] = agg_display ["ttft" ].map (lambda x : f"{ x :.4f} s" )
446+ agg_display ["ttft_delta" ] = agg .apply (
447+ lambda r : f"{ r ['ttft_delta' ]:+.1f} %" if r ["mode" ] != "baseline" else "-" , axis = 1 )
448+ agg_display ["prefix" ] = agg_display ["prefix" ].map (lambda x : f"{ x :.1%} " )
449+ agg_display ["f1" ] = agg_display ["f1" ].map (lambda x : f"{ x :.3f} " )
450+ agg_display ["judge" ] = agg_display ["judge" ].map (lambda x : f"{ x :.3f} " )
451+ print (agg_display [["k" , "mode" , "ttft" , "ttft_delta" , "prefix" , "f1" , "judge" ]].to_string (index = False ))
0 commit comments