From b62922ed89bf1ae2e85554c46e835c9b942f7536 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 13:39:41 -0500 Subject: [PATCH 01/10] lore: Avoid repeatedly downloading lore archives Lore indexing required scanning the lore table to determine which commits had already been processed. The lore table contains parsed email records, not a direct mapping of indexed commits, making duplicate detection both slow and unreliable. A dedicated lore_indexed_commits table now tracks processed git commit SHAs. After successful insertion of lore emails, commit SHAs are recorded in this table via merge_insert keyed on git_commit_sha, so repeated SHAs are absorbed rather than duplicated. Subsequent runs load the full table into a HashSet to skip already-processed commits, avoiding redundant downloads and parsing of mailing list archives. The table has a single git_commit_sha column and integrates into schema initialization and repair; because it stores only short SHA strings, reading it entirely into memory is inexpensive. 
Fixes: 39ae6a3790af ("semcode-index: Add --refresh-lore to update tracked archives") Signed-off-by: Chuck Lever --- scripts/direct_download.py | 0 scripts/nomic2vec.py | 0 src/bin/index.rs | 4 +- src/database/connection.rs | 129 +++++++++++++++++++++++++------------ src/database/schema.rs | 25 +++++++ src/indexer.rs | 17 +++++ 6 files changed, 131 insertions(+), 44 deletions(-) mode change 100644 => 100755 scripts/direct_download.py mode change 100644 => 100755 scripts/nomic2vec.py diff --git a/scripts/direct_download.py b/scripts/direct_download.py old mode 100644 new mode 100755 diff --git a/scripts/nomic2vec.py b/scripts/nomic2vec.py old mode 100644 new mode 100755 diff --git a/src/bin/index.rs b/src/bin/index.rs index accdd15..1399a53 100644 --- a/src/bin/index.rs +++ b/src/bin/index.rs @@ -660,10 +660,10 @@ async fn index_lore_archive( let total_commits = all_commit_shas.len(); info!("Found {} total commits in lore archive", total_commits); - // Get already indexed commits from database using efficient batched queries + // Get already indexed commits from database println!("Checking for already-indexed commits..."); let existing_commits = db_manager - .filter_existing_lore_commits(&all_commit_shas) + .get_indexed_lore_commits() .await?; // Filter out already indexed commits diff --git a/src/database/connection.rs b/src/database/connection.rs index 51679a2..e5b89d0 100644 --- a/src/database/connection.rs +++ b/src/database/connection.rs @@ -83,6 +83,7 @@ impl DatabaseManager { "symbol_filename", "git_commits", "lore", + "lore_indexed_commits", "indexed_branches", ] { if let Ok(table) = self.connection.open_table(*table_name).execute().await { @@ -3758,6 +3759,26 @@ impl DatabaseManager { Field::new("symbols", DataType::Utf8, false), ])); + // Deduplicate by message_id within the batch. A lore archive + // can contain the same email in multiple git commits, and + // LanceDB merge_insert requires each target row to be matched + // by at most one source row. 
Keep the last occurrence. + let mut seen = std::collections::HashMap::with_capacity(emails.len()); + for (i, email) in emails.iter().enumerate() { + seen.insert(&email.message_id, i); + } + let mut dedup_indices: Vec = seen.into_values().collect(); + dedup_indices.sort_unstable(); + + let dedup_count = emails.len() - dedup_indices.len(); + if dedup_count > 0 { + tracing::info!( + "insert_lore_emails: Removed {} duplicate message_ids from batch of {}", + dedup_count, + emails.len() + ); + } + let mut git_commit_shas = Vec::new(); let mut from_addrs = Vec::new(); let mut dates = Vec::new(); @@ -3770,7 +3791,8 @@ impl DatabaseManager { let mut bodies = Vec::new(); let mut symbols_list = Vec::new(); - for email in emails { + for &idx in &dedup_indices { + let email = &emails[idx]; git_commit_shas.push(email.git_commit_sha.clone()); from_addrs.push(email.from.clone()); dates.push(email.date.clone()); @@ -5331,49 +5353,34 @@ impl DatabaseManager { Ok(commits) } - /// Filter a list of commit SHAs to return only those that already exist in the lore table. - /// Uses batched IN queries to avoid full table scans. - /// Returns a HashSet of SHAs that exist in the database. 
- pub async fn filter_existing_lore_commits( - &self, - commit_shas: &[String], - ) -> Result> { - if commit_shas.is_empty() { - return Ok(HashSet::new()); - } - - let table = self.connection.open_table("lore").execute().await?; - let mut existing = HashSet::with_capacity(commit_shas.len() / 2); // Pre-allocate for typical case - - // Process in chunks to avoid SQL query size limits - // LanceDB/DuckDB can handle large IN clauses, but 1000 is a safe batch size - const BATCH_SIZE: usize = 1000; - - for chunk in commit_shas.chunks(BATCH_SIZE) { - // Build SQL IN clause with properly escaped SHAs - let escaped_shas: Vec = chunk - .iter() - .map(|sha| format!("'{}'", sha.replace("'", "''"))) - .collect(); - let filter = format!("git_commit_sha IN ({})", escaped_shas.join(", ")); - - let stream = table - .query() - .select(lancedb::query::Select::Columns(vec![ - "git_commit_sha".to_string() - ])) - .only_if(&filter) - .execute() - .await?; + /// Return the set of commit SHAs already recorded in the + /// lore_indexed_commits table. The table contains only short + /// SHA strings, so reading it entirely into memory is cheap. 
+ pub async fn get_indexed_lore_commits(&self) -> Result> { + let table = match self.connection.open_table("lore_indexed_commits").execute().await { + Ok(t) => t, + Err(e) => { + tracing::warn!("Failed to open lore_indexed_commits table: {}", e); + return Ok(HashSet::new()); + } + }; - let batches: Vec<_> = stream.try_collect().await?; + let stream = table + .query() + .select(lancedb::query::Select::Columns(vec![ + "git_commit_sha".to_string(), + ])) + .execute() + .await?; - for batch in batches { - if let Some(column) = batch.column_by_name("git_commit_sha") { - if let Some(string_array) = column.as_any().downcast_ref::() { - for i in 0..string_array.len() { - existing.insert(string_array.value(i).to_string()); - } + let batches: Vec<_> = stream.try_collect().await?; + let mut existing = HashSet::new(); + for batch in batches { + if let Some(column) = batch.column_by_name("git_commit_sha") { + if let Some(string_array) = column.as_any().downcast_ref::() { + existing.reserve(string_array.len()); + for i in 0..string_array.len() { + existing.insert(string_array.value(i).to_string()); } } } @@ -5381,4 +5388,42 @@ impl DatabaseManager { Ok(existing) } + + /// Record git commit SHAs that have been processed for lore indexing. 
+ pub async fn insert_lore_indexed_commits(&self, commit_shas: &[String]) -> Result<()> { + use arrow::array::{ArrayRef, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + use std::sync::Arc; + + if commit_shas.is_empty() { + return Ok(()); + } + + let schema = Arc::new(Schema::new(vec![ + Field::new("git_commit_sha", DataType::Utf8, false), + ])); + + let columns: Vec = vec![ + Arc::new(StringArray::from(commit_shas.to_vec())), + ]; + + let batch = RecordBatch::try_new(schema.clone(), columns)?; + let batches = vec![Ok(batch)]; + let batch_iterator = + arrow::record_batch::RecordBatchIterator::new(batches.into_iter(), schema); + + let table = self + .connection + .open_table("lore_indexed_commits") + .execute() + .await?; + let mut merge_insert = table.merge_insert(&["git_commit_sha"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert.execute(Box::new(batch_iterator)).await?; + + Ok(()) + } } diff --git a/src/database/schema.rs b/src/database/schema.rs index 9fff3a8..019268e 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -65,6 +65,10 @@ impl SchemaManager { self.create_lore_table().await?; } + if !table_names.iter().any(|n| n == "lore_indexed_commits") { + self.create_lore_indexed_commits_table().await?; + } + if !table_names.iter().any(|n| n == "lore_vectors") { self.create_lore_vectors_table().await?; } @@ -292,6 +296,24 @@ impl SchemaManager { Ok(()) } + async fn create_lore_indexed_commits_table(&self) -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("git_commit_sha", DataType::Utf8, false), + ])); + + let empty_batch = RecordBatch::new_empty(schema.clone()); + let batches = vec![Ok(empty_batch)]; + let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); + + self.connection + .create_table("lore_indexed_commits", batch_iterator) + .execute() + .await?; + + tracing::info!("Created 
lore_indexed_commits table"); + Ok(()) + } + async fn create_indexed_branches_table(&self) -> Result<()> { use crate::database::branches::IndexedBranchStore; @@ -885,6 +907,7 @@ impl SchemaManager { "git_commits".to_string(), "lore".to_string(), "indexed_branches".to_string(), + "lore_indexed_commits".to_string(), ]; // Add all content shard tables @@ -1064,6 +1087,7 @@ impl SchemaManager { "git_commits", "lore", "indexed_branches", + "lore_indexed_commits", ]; // Add all content shard tables @@ -1235,6 +1259,7 @@ impl SchemaManager { "symbol_filename" => self.create_symbol_filename_table().await, "git_commits" => self.create_git_commits_table().await, "lore" => self.create_lore_table().await, + "lore_indexed_commits" => self.create_lore_indexed_commits_table().await, "indexed_branches" => self.create_indexed_branches_table().await, "content" => self.create_content_table().await, name if name.starts_with("content_") => { diff --git a/src/indexer.rs b/src/indexer.rs index 58675bd..328aa3b 100644 --- a/src/indexer.rs +++ b/src/indexer.rs @@ -966,6 +966,23 @@ pub async fn process_lore_commits_pipeline( if let Err(e) = db_manager_clone.insert_lore_emails(&emails).await { error!("Inserter {} failed to insert batch: {}", inserter_id, e); } else { + // Record processed commit SHAs so they are + // not re-examined on subsequent runs. 
+ let shas: Vec = emails.iter() + .map(|e| e.git_commit_sha.as_str()) + .collect::>() + .into_iter() + .map(String::from) + .collect(); + if let Err(e) = db_manager_clone + .insert_lore_indexed_commits(&shas).await + { + error!( + "Inserter {} failed to record indexed commits: {}", + inserter_id, e + ); + } + let count = inserted_clone.fetch_add(batch_len, Ordering::Relaxed); pb_clone.set_message(format!("Inserted {} emails", count + batch_len)); From 6697886b1d04b1c47cddda0aba7a51b5daf36708 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Feb 2026 11:38:22 -0500 Subject: [PATCH 02/10] lore: Process archive refreshes sequentially The --lore refresh path uses buffer_unordered() to index up to four archives concurrently. Each archive pipeline spawns its own set of database inserter tasks, all sharing the same LanceDB connection and its underlying DataFusion memory pool. With large-row archives such as oe-kbuild-all, concurrent merge_insert operations from separate pipelines exhaust the memory pool simultaneously. Neither pipeline can make progress because each holds a portion of the pool while waiting for more, producing a resource deadlock visible as two frozen progress bars with unchanging "Inserted N emails" counts. Replace buffer_unordered() with a sequential loop, matching the approach already used by the --lore initial-clone path. The git fetch for each archive still runs inline, so network latency is the only cost; the database insertion -- which dominates wall-clock time -- no longer contends for the shared memory pool. 
Fixes: 39ae6a3790af ("semcode-index: Add --refresh-lore to update tracked archives") Signed-off-by: Chuck Lever --- src/bin/index.rs | 76 ++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/src/bin/index.rs b/src/bin/index.rs index 1399a53..3b0533b 100644 --- a/src/bin/index.rs +++ b/src/bin/index.rs @@ -2,7 +2,6 @@ use anyhow::Result; use clap::Parser; use colored::Colorize; -use futures::stream::{self, StreamExt}; use semcode::indexer::{ list_shas_in_range, process_commits_pipeline, process_lore_commits_pipeline, }; @@ -1173,51 +1172,40 @@ async fn main() -> Result<()> { let db_threads = args.db_threads; let total_archives = archives_with_names.len(); - // Process archives in parallel (up to 4 concurrent fetches/indexes) - // This provides significant speedup when tracking multiple mailing lists - let concurrency = std::cmp::min(4, total_archives); + // Process archives sequentially. LanceDB merge_insert + // uses a shared DataFusion memory pool, and concurrent + // pipelines writing large lore emails exhaust it, + // causing both to stall indefinitely. 
+ let mut results: Vec> = + Vec::with_capacity(total_archives); - println!( - "Processing {} archives with concurrency {}...", - total_archives, concurrency - ); + for (archive_path, display_name) in archives_with_names { + println!("\n=== Refreshing lore archive: {} ===", display_name); + println!("[{}] Fetching updates from remote...", display_name); - let results: Vec> = - stream::iter(archives_with_names) - .map(|(archive_path, display_name)| { - let db_manager = db_manager.clone(); - async move { - println!("\n=== Refreshing lore archive: {} ===", display_name); - - // Fetch new commits from remote (async, runs on blocking thread pool) - println!("[{}] Fetching updates from remote...", display_name); - let lore_repo = match fetch_lore_archive(archive_path.clone()).await { - Ok(repo) => repo, - Err(e) => { - return Err((display_name, e)); - } - }; - - // Index the archive using the shared function - match index_lore_archive( - lore_repo, - &archive_path, - &display_name, - &db_manager, - batch_size, - num_workers, - db_threads, - ) - .await - { - Ok(result) => Ok((display_name, result)), - Err(e) => Err((display_name, e)), - } - } - }) - .buffer_unordered(concurrency) - .collect() - .await; + let lore_repo = match fetch_lore_archive(archive_path.clone()).await { + Ok(repo) => repo, + Err(e) => { + results.push(Err((display_name, e))); + continue; + } + }; + + match index_lore_archive( + lore_repo, + &archive_path, + &display_name, + &db_manager, + batch_size, + num_workers, + db_threads, + ) + .await + { + Ok(result) => results.push(Ok((display_name, result))), + Err(e) => results.push(Err((display_name, e))), + } + } // Aggregate results let mut total_new_emails = 0usize; From 16c30d4d106d046f93e733d836c98f9b4412a384 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Feb 2026 17:41:12 -0500 Subject: [PATCH 03/10] lore: Retry failed chunk inserts individually insert_lore_emails() feeds an entire pipeline batch (up to 1024 emails) into a single LanceDB 
merge_insert call. Each lore email carries full headers and body text, so the resulting RecordBatch is far larger than a typical code-analysis batch. LanceDB merge_insert uses DataFusion's RepartitionExec internally, and the oversized batch exhausts the DataFusion memory pool -- particularly when two inserter tasks submit concurrently. The failure manifests as: Resources exhausted: Failed to allocate additional 11.6 MB for RepartitionExec[0] with 11.9 MB already allocated for this reservation Try the full batch first, since a single merge_insert is far cheaper than many small ones; when it fails, split the deduplicated email indices into chunks of 128 and issue a separate merge_insert per chunk to bound peak memory per operation. When a chunk still fails (e.g. a single email is large enough to exhaust the pool on its own), fall back to inserting each email in the chunk individually so that only genuinely uninsertable messages are skipped. Signed-off-by: Chuck Lever --- src/database/connection.rs | 121 +++++++++++++++++++++++++++++-------- 1 file changed, 97 insertions(+), 24 deletions(-) diff --git a/src/database/connection.rs b/src/database/connection.rs index e5b89d0..f0bd851 100644 --- a/src/database/connection.rs +++ b/src/database/connection.rs @@ -3731,9 +3731,7 @@ impl DatabaseManager { /// Insert lore emails into the database pub async fn insert_lore_emails(&self, emails: &[crate::types::LoreEmailInfo]) -> Result<()> { - use arrow::array::{ArrayRef, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; - use arrow::record_batch::RecordBatch; use std::sync::Arc; if emails.is_empty() { @@ -3779,19 +3777,99 @@ impl DatabaseManager { ); } - let mut git_commit_shas = Vec::new(); - let mut from_addrs = Vec::new(); - let mut dates = Vec::new(); - let mut message_ids = Vec::new(); - let mut in_reply_tos = Vec::new(); - let mut subjects = Vec::new(); - let mut references_list = Vec::new(); - let mut recipients_list = Vec::new(); - let mut headers_list = Vec::new(); - let mut bodies = Vec::new(); - let mut symbols_list = Vec::new(); + let table = 
self.connection.open_table("lore").execute().await?; + + // Try inserting the full batch first -- a single merge_insert + // is far cheaper than many small ones because each call is a + // full read-modify-write cycle in LanceDB. Fall back to + // chunked insertion only when the batch exhausts DataFusion's + // memory pool (the RepartitionExec OOM described in the + // comment below). + // + // Retrying the full set of indices on failure is safe: + // merge_insert is not transactional, so a failed call may + // have persisted some rows before the error. Because the + // upsert key is message_id, re-inserting those rows is + // idempotent. + if let Err(e) = + Self::merge_insert_lore_chunk(&table, emails, &dedup_indices, &schema).await + { + tracing::warn!( + "insert_lore_emails: full batch of {} failed ({}), \ + falling back to chunked insertion", + dedup_indices.len(), + e + ); + + // Lore emails carry full headers and bodies, so each row + // is large compared to code-analysis records. LanceDB + // merge_insert uses DataFusion's RepartitionExec, whose + // memory pool can be exhausted by a single oversized + // RecordBatch. Insert in sub-batches to bound peak + // memory per operation. + const MAX_CHUNK: usize = 128; + + for chunk in dedup_indices.chunks(MAX_CHUNK) { + if let Err(e) = + Self::merge_insert_lore_chunk(&table, emails, chunk, &schema).await + { + tracing::warn!( + "insert_lore_emails: chunk of {} failed ({}), \ + retrying individually", + chunk.len(), + e + ); + for &idx in chunk { + if let Err(e2) = + Self::merge_insert_lore_chunk( + &table, emails, &[idx], &schema, + ) + .await + { + tracing::warn!( + "insert_lore_emails: skipping \ + message_id={}: {}", + emails[idx].message_id, + e2 + ); + } + } + } + } + } - for &idx in &dedup_indices { + tracing::info!("insert_lore_emails: Batch insertion complete"); + + Ok(()) + } + + /// Build a [`RecordBatch`] from the given email indices and + /// merge-insert it into the lore table. 
+ async fn merge_insert_lore_chunk( + table: &lancedb::Table, + emails: &[crate::types::LoreEmailInfo], + indices: &[usize], + schema: &std::sync::Arc, + ) -> Result<()> { + use arrow::array::{ArrayRef, StringArray}; + use arrow::record_batch::RecordBatch; + use std::sync::Arc; + + debug_assert!(indices.iter().all(|&i| i < emails.len())); + + let mut git_commit_shas = Vec::with_capacity(indices.len()); + let mut from_addrs = Vec::with_capacity(indices.len()); + let mut dates = Vec::with_capacity(indices.len()); + let mut message_ids = Vec::with_capacity(indices.len()); + let mut in_reply_tos = Vec::with_capacity(indices.len()); + let mut subjects = Vec::with_capacity(indices.len()); + let mut references_list = Vec::with_capacity(indices.len()); + let mut recipients_list = Vec::with_capacity(indices.len()); + let mut headers_list = Vec::with_capacity(indices.len()); + let mut bodies = Vec::with_capacity(indices.len()); + let mut symbols_list = Vec::with_capacity(indices.len()); + + for &idx in indices { let email = &emails[idx]; git_commit_shas.push(email.git_commit_sha.clone()); from_addrs.push(email.from.clone()); @@ -3803,9 +3881,7 @@ impl DatabaseManager { recipients_list.push(email.recipients.clone()); headers_list.push(email.headers.clone()); bodies.push(email.body.clone()); - // Convert Vec to JSON array string - let symbols_json = serde_json::to_string(&email.symbols)?; - symbols_list.push(symbols_json); + symbols_list.push(serde_json::to_string(&email.symbols)?); } let columns: Vec = vec![ @@ -3825,19 +3901,16 @@ impl DatabaseManager { let batch = RecordBatch::try_new(schema.clone(), columns)?; let batches = vec![Ok(batch)]; let batch_iterator = - arrow::record_batch::RecordBatchIterator::new(batches.into_iter(), schema); - - let table = self.connection.open_table("lore").execute().await?; + arrow::record_batch::RecordBatchIterator::new( + batches.into_iter(), + schema.clone(), + ); - // Use merge_insert for upsert functionality (update if exists, insert if 
not) let mut merge_insert = table.merge_insert(&["message_id"]); merge_insert .when_matched_update_all(None) .when_not_matched_insert_all(); merge_insert.execute(Box::new(batch_iterator)).await?; - - tracing::info!("insert_lore_emails: Batch insertion complete"); - Ok(()) } From 61e54930490e5d8e7a396e982a7b98afede693d6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Feb 2026 16:00:36 -0500 Subject: [PATCH 04/10] semcode: Guard against compaction hang with many fragments LanceDB compaction encounters a pathological case when a table accumulates thousands of small fragments. The compact operation enters a CPU loop where the main thread spins at 100% CPU utilization while worker threads remain idle. Any table subjected to many small appends without intervening compaction can reach this state. A check now examines fragment count before compaction proceeds. When fragment count exceeds 500, compaction is skipped and a warning directs the user to rebuild the database with --clear. This threshold prevents the hang condition while allowing normal compaction for tables with moderate fragmentation. Prune, index, and checkout operations remain unaffected; only the compact step is gated by this fragment limit. Fixes: 4a16e152fafb ("semcode-index: optimize database periodically during long-running indexing") Signed-off-by: Chuck Lever --- src/database/schema.rs | 52 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/database/schema.rs b/src/database/schema.rs index 019268e..def10f6 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -1016,15 +1016,49 @@ impl SchemaManager { let mut success = true; // 1. 
Compact files - if let Err(e) = table - .optimize(OptimizeAction::Compact { - options: Default::default(), - remap_options: None, - }) - .await - { - tracing::warn!("Failed to compact table {}: {}", table_name, e); - success = false; + const MAX_COMPACT_FRAGMENTS: usize = 500; + + let should_compact = match table.stats().await { + Ok(stats) + if stats.fragment_stats.num_fragments + > MAX_COMPACT_FRAGMENTS => + { + tracing::warn!( + "Skipping compaction for table {} \ + ({} fragments exceeds {} limit -- \ + rebuild with --clear to resolve)", + table_name, + stats.fragment_stats.num_fragments, + MAX_COMPACT_FRAGMENTS + ); + false + } + Ok(_) => true, + Err(e) => { + tracing::warn!( + "Failed to read stats for table {}: {}", + table_name, + e + ); + true + } + }; + + if should_compact { + if let Err(e) = table + .optimize(OptimizeAction::Compact { + options: Default::default(), + remap_options: None, + }) + .await + { + tracing::warn!( + "Failed to compact table {}: {}", + table_name, + e + ); + success = false; + } } // 2. Prune ALL old versions From f6f51d66c633cad82cf9fe6fcd738d63ef08674c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 11:00:29 -0500 Subject: [PATCH 05/10] database: Use merge_insert for vectors table VectorStore::insert_chunk() uses bare table.add() to append vector entries. When functions are re-vectorized after reindexing, entries with the same content_hash accumulate as duplicates rather than being replaced, and each insertion creates a new fragment. The content table already handles this correctly with merge_insert keyed on blake3_hash. Apply the same pattern here: merge_insert keyed on content_hash, matching the deduplication strategy used elsewhere in the database layer. Existing databases must be rebuilt with --clear to reclaim the space already wasted by duplicate vector entries. 
Signed-off-by: Chuck Lever --- src/database/vectors.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/database/vectors.rs b/src/database/vectors.rs index 727e2bc..745c339 100644 --- a/src/database/vectors.rs +++ b/src/database/vectors.rs @@ -89,7 +89,12 @@ impl VectorStore { let batches = vec![Ok(batch)]; let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); - table.add(batch_iterator).execute().await?; + + let mut merge_insert = table.merge_insert(&["content_hash"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert.execute(Box::new(batch_iterator)).await?; Ok(()) } From 131656673a27358e896c790a364a1ba191258122 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 11:27:40 -0500 Subject: [PATCH 06/10] database: Use merge_insert for commit_vectors table insert_commit_vectors_batch_with_table() uses bare table.add() to append vector entries. When commits are re-vectorized after reindexing, entries with the same git_commit_sha accumulate as duplicates rather than being replaced, and each insertion creates a new fragment. Apply the same merge_insert pattern used for the vectors and lore_indexed_commits tables: key on git_commit_sha so repeated SHAs are absorbed rather than duplicated. Existing databases must be rebuilt with --clear to reclaim the space already wasted by duplicate vector entries. 
Fixes: ba422c7feee0 ("Start indexing git commits (database schema change)") Signed-off-by: Chuck Lever --- src/database/search.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/database/search.rs b/src/database/search.rs index 72f5ceb..d6c0f38 100644 --- a/src/database/search.rs +++ b/src/database/search.rs @@ -2741,7 +2741,12 @@ impl VectorSearchManager { let batches = vec![Ok(batch)]; let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); - commit_vectors_table.add(batch_iterator).execute().await?; + + let mut merge_insert = commit_vectors_table.merge_insert(&["git_commit_sha"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert.execute(Box::new(batch_iterator)).await?; Ok(()) } From 10425c66f4686185dceb9ec6ac872ecae90c49c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 11:28:08 -0500 Subject: [PATCH 07/10] database: Use merge_insert for lore_vectors table insert_lore_vectors_batch_with_table() uses bare table.add() to append vector entries. When lore emails are re-vectorized, entries with the same message_id accumulate as duplicates rather than being replaced, and each insertion creates a new fragment. update_lore_vectors() already skips message_ids present in the table, but a re-vectorization pass (after model changes or --clear of the vectors table alone) still produces duplicates. Apply the same merge_insert pattern used for the other vector tables: key on message_id so repeated entries are absorbed rather than duplicated. Existing databases must be rebuilt with --clear to reclaim the space already wasted by duplicate vector entries. 
Fixes: 01b93990f978 ("semcode: add --lore for email indexing (database schema change)") Signed-off-by: Chuck Lever --- src/database/search.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/database/search.rs b/src/database/search.rs index d6c0f38..15a3cda 100644 --- a/src/database/search.rs +++ b/src/database/search.rs @@ -3122,7 +3122,12 @@ async fn insert_lore_vectors_batch_with_table( let batches = vec![Ok(batch)]; let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); - lore_vectors_table.add(batch_iterator).execute().await?; + + let mut merge_insert = lore_vectors_table.merge_insert(&["message_id"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert.execute(Box::new(batch_iterator)).await?; Ok(()) } From 95bee274421dc2c5eddea22e6de345e0c2988be4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 17:16:41 -0500 Subject: [PATCH 08/10] lore: Drop stale FTS indices before rebuilding them During lore indexing, FTS indices were created before the compaction and prune steps of database optimization. The optimization then rebuilt all FTS indices against the new compacted layout, orphaning the original set. LanceDB's prune operation removes old data versions but not old index data, leaving both sets on disk. On a kernel tree with several lore archives this wasted roughly 1.5 GB. Reorder both the --lore and --refresh-lore code paths so that optimization runs first and FTS index creation follows. Additionally, have create_lore_fts_indices() drop any existing FTS indices, create fresh replacements, and then prune the table. drop_index() only removes the logical reference from the manifest; the old index directory under _indices/ persists as orphaned data until a prune pass reclaims it. The same is true of directories left behind by OptimizeAction::Index, which rebuilds all indices into new directories without removing the prior set. 
The trailing prune removes both sources of orphaned index data. Signed-off-by: Chuck Lever --- src/bin/index.rs | 32 +++++++++++------------ src/database/schema.rs | 59 +++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 34 deletions(-) diff --git a/src/bin/index.rs b/src/bin/index.rs index 3b0533b..9046673 100644 --- a/src/bin/index.rs +++ b/src/bin/index.rs @@ -1080,15 +1080,9 @@ async fn main() -> Result<()> { total_emails_all_archives ); - // Create FTS indices for lore table after data is inserted + // Optimize before creating FTS indices so that compaction + // does not orphan the index data that was just built. if total_new_emails > 0 { - println!("\nCreating FTS indices for lore table..."); - match db_manager.create_lore_fts_indices().await { - Ok(_) => println!("FTS indices created successfully"), - Err(e) => eprintln!("Warning: Failed to create FTS indices: {}", e), - } - - // Check if optimization is needed after lore indexing match db_manager.check_optimization_health().await { Ok((needs_optimization, message)) => { if needs_optimization { @@ -1105,6 +1099,12 @@ async fn main() -> Result<()> { error!("Failed to check database health: {}", e); } } + + println!("\nCreating FTS indices for lore table..."); + match db_manager.create_lore_fts_indices().await { + Ok(_) => println!("FTS indices created successfully"), + Err(e) => eprintln!("Warning: Failed to create FTS indices: {}", e), + } } println!("\nTo query this database, run:"); @@ -1248,15 +1248,9 @@ async fn main() -> Result<()> { } } - // Create FTS indices for lore table after data is inserted + // Optimize before creating FTS indices so that compaction + // does not orphan the index data that was just built. 
if total_new_emails > 0 { - println!("\nCreating FTS indices for lore table..."); - match db_manager.create_lore_fts_indices().await { - Ok(_) => println!("FTS indices created successfully"), - Err(e) => eprintln!("Warning: Failed to create FTS indices: {}", e), - } - - // Check if optimization is needed match db_manager.check_optimization_health().await { Ok((needs_optimization, message)) => { if needs_optimization { @@ -1273,6 +1267,12 @@ async fn main() -> Result<()> { error!("Failed to check database health: {}", e); } } + + println!("\nCreating FTS indices for lore table..."); + match db_manager.create_lore_fts_indices().await { + Ok(_) => println!("FTS indices created successfully"), + Err(e) => eprintln!("Warning: Failed to create FTS indices: {}", e), + } } // Check for new archives available on lore.kernel.org diff --git a/src/database/schema.rs b/src/database/schema.rs index def10f6..fdc354d 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -801,33 +801,34 @@ impl SchemaManager { .await { Ok(_) => tracing::info!("✓ Completed {}", description), - Err(e) => tracing::debug!("{} may already exist: {}", description, e), + Err(e) => tracing::warn!("Failed to create {}: {}", description, e), } } /// Create FTS indices for lore table (must be called after data is inserted) - /// Only creates indices if they don't already exist + /// + /// Drops any existing FTS indices first, creates fresh ones, then + /// prunes so that the on-disk index directories from prior data + /// layouts are actually removed. pub async fn create_lore_fts_indices(&self) -> Result<()> { let table = self.connection.open_table("lore").execute().await?; - // Check if FTS indices already exist by trying to list them + // Drop existing FTS indices before recreating them. + // drop_index() removes the logical reference but leaves the + // old directory under _indices/ as orphaned data; a prune + // pass below reclaims that space. 
+ use lancedb::index::IndexType; let indices: Vec = (table.list_indices().await).unwrap_or_default(); - - // Check if FTS indices already exist (index_type must be FTS) - use lancedb::index::IndexType; - let has_fts_indices = indices.iter().any(|idx| { - idx.index_type == IndexType::FTS - && (idx.columns.contains(&"from".to_string()) - || idx.columns.contains(&"subject".to_string()) - || idx.columns.contains(&"body".to_string()) - || idx.columns.contains(&"recipients".to_string()) - || idx.columns.contains(&"symbols".to_string())) - }); - - if has_fts_indices { - tracing::info!("FTS indices already exist for lore table, skipping creation"); - return Ok(()); + let mut dropped = false; + for idx in &indices { + if idx.index_type == IndexType::FTS { + tracing::info!("Dropping stale FTS index: {}", idx.name); + if let Err(e) = table.drop_index(&idx.name).await { + tracing::warn!("Failed to drop FTS index {}: {}", idx.name, e); + } + dropped = true; + } } // Create FTS indices for text search on all searchable fields in parallel @@ -848,6 +849,28 @@ impl SchemaManager { "Completed creating 5 FTS indices in {:.1}s", elapsed.as_secs_f64() ); + + // Prune orphaned index data left behind by drop_index() + // and by OptimizeAction::Index in optimize_single_table(), + // which rebuilds all indices (including FTS) into new + // directories without removing the old ones. 
+ if dropped { + tracing::info!("Pruning orphaned index data from lore table..."); + if let Err(e) = table + .optimize(OptimizeAction::Prune { + older_than: Some( + lancedb::table::Duration::try_seconds(0) + .expect("valid duration"), + ), + delete_unverified: Some(true), + error_if_tagged_old_versions: Some(false), + }) + .await + { + tracing::warn!("Failed to prune lore table after FTS rebuild: {}", e); + } + } + Ok(()) } From 7ddc6917e95349b4bc580a1606da2d6572bcbebe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 17:25:42 -0500 Subject: [PATCH 09/10] lore: Drop the headers column from the lore table The lore table stored full RFC 5322 headers alongside the individual fields already extracted into their own columns (from, date, subject, message_id, in_reply_to, references, recipients). The only consumer was write_email_as_mbox(), which emitted the raw header block when producing MBOX output. No query path ever searched or filtered on the raw headers column. The lore_search MCP function provides from_patterns, subject_patterns, and recipients_patterns, each backed by FTS indices on the individual columns. A raw-headers search would be strictly less useful than what already exists. Reconstruct the header block from the individual columns at output time instead of storing it. This eliminates a full copy of every email's headers from the lance data files and from the FTS body index scans, reducing the per-email storage footprint of the lore table. Existing databases are migrated automatically: migrate_lore_table() detects the column at startup and drops it via LanceDB schema evolution. 
Signed-off-by: Chuck Lever --- docs/schema.md | 2 -- src/database/connection.rs | 38 ++------------------------------------ src/database/schema.rs | 36 +++++++++++++++++++++++++++++++++++- src/database/search.rs | 10 ++-------- src/indexer.rs | 6 ------ src/lore_writers.rs | 21 +++++++++++++-------- src/types.rs | 1 - 7 files changed, 52 insertions(+), 62 deletions(-) diff --git a/docs/schema.md b/docs/schema.md index a550380..963b005 100644 --- a/docs/schema.md +++ b/docs/schema.md @@ -307,7 +307,6 @@ in_reply_to (Utf8, nullable) - Message-ID of parent email subject (Utf8, NOT NULL) - Email subject line references (Utf8, nullable) - Space-separated Message-IDs of thread ancestors recipients (Utf8, NOT NULL) - Comma-separated To/Cc recipients -headers (Utf8, NOT NULL) - Full email headers body (Utf8, NOT NULL) - Email body content symbols (Utf8, NOT NULL) - JSON array of symbols found in patches/diffs ``` @@ -319,7 +318,6 @@ symbols (Utf8, NOT NULL) - JSON array of symbols found in patche - BTree on `date` (chronological queries) - BTree on `in_reply_to` (threading queries) - BTree on `references` (threading queries) -- BTree on `headers` (header searches) - **FTS (Full Text Search) on `from`** - Fast keyword search on sender - **FTS on `subject`** - Fast keyword search on subject lines - **FTS on `body`** - Fast keyword search on email bodies diff --git a/src/database/connection.rs b/src/database/connection.rs index f0bd851..00f08f6 100644 --- a/src/database/connection.rs +++ b/src/database/connection.rs @@ -3752,7 +3752,6 @@ impl DatabaseManager { Field::new("subject", DataType::Utf8, false), Field::new("references", DataType::Utf8, true), Field::new("recipients", DataType::Utf8, false), - Field::new("headers", DataType::Utf8, false), Field::new("body", DataType::Utf8, false), Field::new("symbols", DataType::Utf8, false), ])); @@ -3801,8 +3800,8 @@ impl DatabaseManager { e ); - // Lore emails carry full headers and bodies, so each row - // is large compared to 
code-analysis records. LanceDB + // Lore emails carry full bodies, so each row is large + // compared to code-analysis records. LanceDB // merge_insert uses DataFusion's RepartitionExec, whose // memory pool can be exhausted by a single oversized // RecordBatch. Insert in sub-batches to bound peak @@ -3865,7 +3864,6 @@ impl DatabaseManager { let mut subjects = Vec::with_capacity(indices.len()); let mut references_list = Vec::with_capacity(indices.len()); let mut recipients_list = Vec::with_capacity(indices.len()); - let mut headers_list = Vec::with_capacity(indices.len()); let mut bodies = Vec::with_capacity(indices.len()); let mut symbols_list = Vec::with_capacity(indices.len()); @@ -3879,7 +3877,6 @@ impl DatabaseManager { subjects.push(email.subject.clone()); references_list.push(email.references.clone()); recipients_list.push(email.recipients.clone()); - headers_list.push(email.headers.clone()); bodies.push(email.body.clone()); symbols_list.push(serde_json::to_string(&email.symbols)?); } @@ -3893,7 +3890,6 @@ impl DatabaseManager { Arc::new(StringArray::from(subjects)), Arc::new(StringArray::from(references_list)), Arc::new(StringArray::from(recipients_list)), - Arc::new(StringArray::from(headers_list)), Arc::new(StringArray::from(bodies)), Arc::new(StringArray::from(symbols_list)), ]; @@ -3961,10 +3957,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? 
@@ -3996,7 +3988,6 @@ impl DatabaseManager { Some(references_list.value(i).to_string()) }, recipients: recipients_list.value(i).to_string(), - headers: headers_list.value(i).to_string(), body: bodies.value(i).to_string(), symbols, }; @@ -4113,10 +4104,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? @@ -4207,7 +4194,6 @@ impl DatabaseManager { Some(references_list.value(i).to_string()) }, recipients: recipients_list.value(i).to_string(), - headers: headers_list.value(i).to_string(), body: bodies.value(i).to_string(), symbols, }; @@ -4649,10 +4635,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? @@ -4684,7 +4666,6 @@ impl DatabaseManager { Some(references_list.value(0).to_string()) }, recipients: recipients_list.value(0).to_string(), - headers: headers_list.value(0).to_string(), body: bodies.value(0).to_string(), symbols, }; @@ -4764,10 +4745,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? 
@@ -4806,7 +4783,6 @@ impl DatabaseManager { Some(references_list.value(i).to_string()) }, recipients: recipients_list.value(i).to_string(), - headers: headers_list.value(i).to_string(), body: bodies.value(i).to_string(), symbols, }; @@ -4892,10 +4868,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? @@ -4927,7 +4899,6 @@ impl DatabaseManager { Some(references_list.value(i).to_string()) }, recipients: recipients_list.value(i).to_string(), - headers: headers_list.value(i).to_string(), body: bodies.value(i).to_string(), symbols, }; @@ -5023,10 +4994,6 @@ impl DatabaseManager { .column_by_name("recipients") .ok_or_else(|| anyhow::anyhow!("Missing recipients column"))? .as_string::(); - let headers_list = batch - .column_by_name("headers") - .ok_or_else(|| anyhow::anyhow!("Missing headers column"))? - .as_string::(); let bodies = batch .column_by_name("body") .ok_or_else(|| anyhow::anyhow!("Missing body column"))? 
@@ -5058,7 +5025,6 @@ impl DatabaseManager { Some(references_list.value(i).to_string()) }, recipients: recipients_list.value(i).to_string(), - headers: headers_list.value(i).to_string(), body: bodies.value(i).to_string(), symbols, }; diff --git a/src/database/schema.rs b/src/database/schema.rs index fdc354d..0efe664 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -63,6 +63,8 @@ impl SchemaManager { if !table_names.iter().any(|n| n == "lore") { self.create_lore_table().await?; + } else { + self.migrate_lore_table().await?; } if !table_names.iter().any(|n| n == "lore_indexed_commits") { @@ -278,7 +280,6 @@ impl SchemaManager { Field::new("subject", DataType::Utf8, false), // Subject line Field::new("references", DataType::Utf8, true), // Full list of references (nullable) Field::new("recipients", DataType::Utf8, false), // Full list of cc/to recipients - Field::new("headers", DataType::Utf8, false), // Email headers (everything before first blank line) Field::new("body", DataType::Utf8, false), // Email body (everything after first blank line) Field::new("symbols", DataType::Utf8, false), // JSON array of symbols referenced in email ])); @@ -296,6 +297,39 @@ impl SchemaManager { Ok(()) } + /// Migrate an existing lore table to the current schema. + async fn migrate_lore_table(&self) -> Result<()> { + let table = self.connection.open_table("lore").execute().await?; + let schema = table.schema().await?; + + // Drop the "headers" column if it exists; individual header + // fields are stored in their own columns and reconstructed + // on demand for MBOX output. + if schema.column_with_name("headers").is_some() { + tracing::info!("Migrating lore table: dropping 'headers' column"); + table.drop_columns(&["headers"]).await?; + + // drop_columns() is a schema-only operation; old data + // fragments still carry the headers bytes on disk. + // Compact to rewrite fragments without the column, + // then prune to delete the stale files. 
+ tracing::info!("Compacting lore table to reclaim space"); + match Self::optimize_single_table(&self.connection, "lore").await? { + OptimizeOutcome::Optimized => { + tracing::info!("Lore table migration complete"); + } + OptimizeOutcome::Skipped => { + tracing::info!("Lore table compaction skipped (table too small)"); + } + OptimizeOutcome::PartialFailure => { + tracing::warn!("Lore table compaction partially failed"); + } + } + } + + Ok(()) + } + async fn create_lore_indexed_commits_table(&self) -> Result<()> { let schema = Arc::new(Schema::new(vec![ Field::new("git_commit_sha", DataType::Utf8, false), diff --git a/src/database/search.rs b/src/database/search.rs index 15a3cda..c1f80dc 100644 --- a/src/database/search.rs +++ b/src/database/search.rs @@ -3636,18 +3636,13 @@ impl VectorSearchManager { .as_any() .downcast_ref::() .unwrap(); - let headers_array = batch - .column(8) - .as_any() - .downcast_ref::() - .unwrap(); let body_array = batch - .column(9) + .column(8) .as_any() .downcast_ref::() .unwrap(); let symbols_array = batch - .column(10) + .column(9) .as_any() .downcast_ref::() .unwrap(); @@ -3729,7 +3724,6 @@ impl VectorSearchManager { Some(references_array.value(i).to_string()) }, recipients: recipients_array.value(i).to_string(), - headers: headers_array.value(i).to_string(), body: body_array.value(i).to_string(), symbols, }, diff --git a/src/indexer.rs b/src/indexer.rs index 328aa3b..6ca9d54 100644 --- a/src/indexer.rs +++ b/src/indexer.rs @@ -415,7 +415,6 @@ pub fn parse_email_from_commit( // Split headers and body let mut lines = email_content.lines(); let mut in_headers = true; - let mut header_lines = Vec::new(); let mut body_lines = Vec::new(); let mut current_header: Option<(String, String)> = None; @@ -430,9 +429,6 @@ pub fn parse_email_from_commit( continue; } - // Store the header line as-is - header_lines.push(line); - // Check if this is a continuation line (starts with whitespace) if line.starts_with(' ') || line.starts_with('\t') { if 
let Some((_, ref mut value)) = current_header { @@ -463,7 +459,6 @@ pub fn parse_email_from_commit( } let recipients = headers.recipients_list.join(", "); - let header_text = header_lines.join("\n"); let body = body_lines.join("\n"); // Extract symbols from any diffs found in the email body @@ -478,7 +473,6 @@ pub fn parse_email_from_commit( subject: headers.subject, references: headers.references, recipients, - headers: header_text, body, symbols, }) diff --git a/src/lore_writers.rs b/src/lore_writers.rs index 1b97c39..df5472f 100644 --- a/src/lore_writers.rs +++ b/src/lore_writers.rs @@ -29,16 +29,21 @@ pub fn write_email_as_mbox(email: &LoreEmailInfo, writer: &mut dyn Write) -> Res // Write the "From " separator line writeln!(writer, "From {} {}", sender, asctime_date)?; - // Write the headers (already includes the original message headers) - write!(writer, "{}", email.headers)?; - - // Ensure there's a blank line between headers and body - if !email.headers.ends_with('\n') { - writeln!(writer)?; + // Reconstruct RFC 5322 headers from individual fields + writeln!(writer, "From: {}", email.from)?; + writeln!(writer, "Subject: {}", email.subject)?; + writeln!(writer, "Date: {}", email.date)?; + writeln!(writer, "Message-ID: {}", email.message_id)?; + if let Some(ref in_reply_to) = email.in_reply_to { + writeln!(writer, "In-Reply-To: {}", in_reply_to)?; } - if !email.headers.ends_with("\n\n") { - writeln!(writer)?; + if let Some(ref references) = email.references { + writeln!(writer, "References: {}", references)?; } + if !email.recipients.is_empty() { + writeln!(writer, "To: {}", email.recipients)?; + } + writeln!(writer)?; // Write the body write!(writer, "{}", email.body)?; diff --git a/src/types.rs b/src/types.rs index 2f0982a..2d66d0c 100644 --- a/src/types.rs +++ b/src/types.rs @@ -137,7 +137,6 @@ pub struct LoreEmailInfo { pub subject: String, // Subject line pub references: Option, // Full list of References headers (nullable) pub recipients: String, // 
Full list of To/CC recipients - pub headers: String, // Email headers (everything before first blank line) pub body: String, // Email body (everything after first blank line) pub symbols: Vec<String>, // List of symbols referenced in the email (empty for now) } From 73d703c88b2064b4108809e390dc77531997848f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 10:24:21 -0500 Subject: [PATCH 10/10] index: Fall back to SEMCODE_JOBS for default thread count The -j flag controls analysis thread count, but there is no way to set a persistent default without a shell alias. When -j is not provided, check the SEMCODE_JOBS environment variable for a thread count before falling back to auto-detection. The explicit flag always takes precedence. This is consistent with SEMCODE_BATCH_SIZE, which already provides an environment-variable override for vectorization batch size. Signed-off-by: Chuck Lever --- src/bin/index.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bin/index.rs b/src/bin/index.rs index 9046673..a1942d2 100644 --- a/src/bin/index.rs +++ b/src/bin/index.rs @@ -963,6 +963,8 @@ async fn main() -> Result<()> { } analysis_threads + } else if let Ok(env_jobs) = std::env::var("SEMCODE_JOBS") { + env_jobs.parse::<usize>().unwrap_or(0) } else { 0 };