Commit c2a8dad
fix: add local-model concurrency guard and expand model tests
- Serialize causal model predictions to avoid concurrent Qwen crashes
- Add FFI concurrency tests for Qwen and other local models
- Auto-download required models for integration tests
- Document test cache env vars in README
Parent: 9a5dd2f

4 files changed, 165 additions and 0 deletions

embeddings/README.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -14,3 +14,12 @@ cargo build --lib --release
 g++ -o test examples/test.cpp -Ltarget/release -lmanticore_knn_embeddings -I. -lpthread -ldl -std=c++17
 ```
 
+## Testing
+
+Some integration tests download model files into a cache directory if they are missing. You can
+override the cache location with environment variables:
+
+- `MANTICORE_TEST_CACHE`: preferred cache path for tests
+- `MANTICORE_CACHE_PATH`: fallback cache path for tests
+
+If neither is set, tests use `./.cache/manticore` under the repo.
````
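For example, to run the test suite against a shared cache outside the repo (the path here is just an illustration):

```sh
MANTICORE_TEST_CACHE=/tmp/manticore-models cargo test --release
```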

embeddings/src/model/ffi_test.rs

Lines changed: 114 additions & 0 deletions
```diff
@@ -6,6 +6,10 @@ use std::ptr;
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::model::local::build_model_info;
+    use std::collections::HashSet;
+    use std::path::PathBuf;
+    use std::sync::{Mutex, OnceLock};
 
     // Helper function to create a C string from Rust string
     fn to_c_string(s: &str) -> CString {
@@ -20,6 +24,97 @@ mod tests {
         }
     }
 
+    fn test_cache_root() -> String {
+        std::env::var("MANTICORE_TEST_CACHE")
+            .or_else(|_| std::env::var("MANTICORE_CACHE_PATH"))
+            .unwrap_or_else(|_| format!("{}/.cache/manticore", env!("CARGO_MANIFEST_DIR")))
+    }
+
+    fn ensure_model_cached(model_id: &str, cache_path: &PathBuf) {
+        static DOWNLOADED: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
+        let downloaded = DOWNLOADED.get_or_init(|| Mutex::new(HashSet::new()));
+        let mut set = downloaded.lock().expect("model cache lock poisoned");
+        if set.contains(model_id) {
+            return;
+        }
+        std::fs::create_dir_all(cache_path).expect("failed to create model cache directory");
+        build_model_info(cache_path.clone(), model_id, "main")
+            .expect("failed to download model into cache");
+        set.insert(model_id.to_string());
+    }
+
+    fn run_concurrent_ffi_embeddings(model_id: &str) {
+        use std::sync::Arc;
+        use std::thread;
+
+        let model_id = model_id.to_string();
+        let cache_root = test_cache_root();
+        let cache_path_buf = PathBuf::from(&cache_root);
+        ensure_model_cached(&model_id, &cache_path_buf);
+
+        let model_name = to_c_string(&model_id);
+        let cache_path = to_c_string(&cache_root);
+        let api_key = to_c_string("");
+
+        let result = TextModelWrapper::load_model(
+            model_name.as_ptr(),
+            model_name.as_bytes().len(),
+            cache_path.as_ptr(),
+            cache_path.as_bytes().len(),
+            api_key.as_ptr(),
+            api_key.as_bytes().len(),
+            false,
+        );
+
+        if result.model.is_null() {
+            let error_message = if result.error.is_null() {
+                "unknown error".to_string()
+            } else {
+                unsafe {
+                    CStr::from_ptr(result.error)
+                        .to_str()
+                        .unwrap_or("unknown error")
+                        .to_string()
+                }
+            };
+            TextModelWrapper::free_model_result(result);
+            panic!("failed to load model {}: {}", model_id, error_message);
+        }
+
+        let model_ptr = result.model as usize;
+        let start = Arc::new(std::sync::Barrier::new(4));
+        let handles: Vec<_> = (0..3)
+            .map(|i| {
+                let start = Arc::clone(&start);
+                let model_ptr = model_ptr;
+                let model_id = model_id.clone();
+                thread::spawn(move || {
+                    start.wait();
+                    let text = format!("Concurrent embedding test {} - {}", model_id, i);
+                    let item = create_string_item(&text);
+                    let items = [item];
+
+                    // Safety: emulate FFI callers that share a model pointer across threads.
+                    let wrapper = unsafe {
+                        std::mem::transmute::<*mut std::ffi::c_void, TextModelWrapper>(
+                            model_ptr as *mut std::ffi::c_void,
+                        )
+                    };
+                    let vec_result =
+                        TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1);
+                    TextModelWrapper::free_vec_result(vec_result);
+                })
+            })
+            .collect();
+
+        start.wait();
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        TextModelWrapper::free_model_result(result);
+    }
+
     #[test]
     fn test_text_model_result_structure() {
         // Test that TextModelResult has the expected structure
@@ -367,4 +462,23 @@ mod tests {
         assert_eq!(options2.api_key, Some("sk-test456".to_string()));
         assert_eq!(options2.use_gpu, None);
     }
+
+    #[test]
+    fn test_concurrent_qwen_embeddings_via_ffi() {
+        run_concurrent_ffi_embeddings("Qwen/Qwen3-Embedding-0.6B");
+    }
+
+    #[test]
+    fn test_concurrent_other_models_via_ffi() {
+        let model_ids = [
+            "sentence-transformers/all-MiniLM-L6-v2",
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            "Locutusque/TinyMistral-248M-v2",
+            "h2oai/embeddinggemma-300m",
+        ];
+
+        for model_id in model_ids {
+            run_concurrent_ffi_embeddings(model_id);
+        }
+    }
 }
```
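The test helper releases its three worker threads and the main thread from the same barrier (hence `Barrier::new(4)`), so the embedding calls genuinely overlap rather than run back-to-back. Below is a minimal, self-contained sketch of that rendezvous pattern, with a `println!` standing in for the real FFI call:

```rust
use std::sync::{Arc, Barrier};
use std::thread;

fn main() {
    // Three workers plus the main thread rendezvous here, so all
    // "requests" are issued at (almost) the same moment.
    let start = Arc::new(Barrier::new(4));
    let handles: Vec<_> = (0..3)
        .map(|i| {
            let start = Arc::clone(&start);
            thread::spawn(move || {
                start.wait(); // block until every participant has arrived
                println!("worker {i}: issuing concurrent request");
            })
        })
        .collect();

    start.wait(); // main thread joins the rendezvous, releasing the workers
    for handle in handles {
        handle.join().unwrap();
    }
}
```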

embeddings/src/model/local.rs

Lines changed: 12 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@ use serde_json::Value;
 use std::cell::RefCell;
 use std::error::Error;
 use std::path::PathBuf;
+use std::sync::Mutex;
 use tokenizers::Tokenizer;
 
 /// Model architecture type - determines pooling strategy
@@ -193,6 +194,7 @@ pub struct CausalEmbeddingModel {
     max_input_len: usize,
     hidden_size: usize,
     device: Device,
+    predict_lock: Mutex<()>,
 }
 
 impl CausalEmbeddingModel {
@@ -282,6 +284,7 @@ impl CausalEmbeddingModel {
            max_input_len,
            hidden_size,
            device,
+           predict_lock: Mutex::new(()),
        })
    }
 }
@@ -345,6 +348,15 @@ impl LocalModel {
 
 impl TextModel for LocalModel {
     fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+        let _predict_guard = match self {
+            LocalModel::Causal(m) => Some(
+                m.predict_lock
+                    .lock()
+                    .map_err(|_| LibError::ModelLoadFailed)?,
+            ),
+            LocalModel::Bert(_) => None,
+        };
+
         let (device, max_input_len) = match self {
             LocalModel::Bert(m) => (m.device.clone(), m.max_input_len),
             LocalModel::Causal(m) => (m.device.clone(), m.max_input_len),
```
embeddings/src/model/local_test.rs

Lines changed: 30 additions & 0 deletions
```diff
@@ -6,15 +6,36 @@ mod tests {
     use crate::model::TextModel;
     use crate::utils::{get_hidden_size, get_max_input_length};
     use approx::assert_abs_diff_eq;
+    use std::collections::HashSet;
     use std::path::PathBuf;
+    use std::sync::{Mutex, OnceLock};
 
     fn check_embedding_properties(embedding: &[f32], expected_len: usize) {
         assert_eq!(embedding.len(), expected_len);
         let norm: f32 = embedding.iter().map(|&x| x * x).sum::<f32>().sqrt();
         assert_abs_diff_eq!(norm, 1.0, epsilon = 1e-6);
     }
 
+    fn ensure_model_cached(model_id: &str, cache_path: &PathBuf) {
+        static DOWNLOADED: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
+        let downloaded = DOWNLOADED.get_or_init(|| Mutex::new(HashSet::new()));
+        let mut set = downloaded.lock().expect("model cache lock poisoned");
+        if set.contains(model_id) {
+            return;
+        }
+        std::fs::create_dir_all(cache_path).expect("failed to create model cache directory");
+        build_model_info(cache_path.clone(), model_id, "main")
+            .expect("failed to download model into cache");
+        set.insert(model_id.to_string());
+    }
+
     fn test_cache_path() -> PathBuf {
+        if let Ok(path) = std::env::var("MANTICORE_TEST_CACHE") {
+            return PathBuf::from(path);
+        }
+        if let Ok(path) = std::env::var("MANTICORE_CACHE_PATH") {
+            return PathBuf::from(path);
+        }
         PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".cache/manticore")
     }
 
@@ -325,6 +346,7 @@
     fn test_all_minilm_l6_v2() {
         let model_id = "sentence-transformers/all-MiniLM-L6-v2";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let test_sentences = [
             "This is a test sentence.",
@@ -343,6 +365,7 @@
     fn test_embedding_consistency() {
         let model_id = "sentence-transformers/all-MiniLM-L6-v2";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
         let local_model = LocalModel::new(model_id, cache_path, false).unwrap();
 
         let sentence = &["This is a test sentence."];
@@ -358,6 +381,7 @@
     fn test_hidden_size() {
         let model_id = "sentence-transformers/all-MiniLM-L6-v2";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
         let local_model = LocalModel::new(model_id, cache_path, false).unwrap();
         assert_eq!(local_model.get_hidden_size(), 384);
     }
@@ -366,6 +390,7 @@
     fn test_max_input_len() {
         let model_id = "sentence-transformers/all-MiniLM-L6-v2";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
         let local_model = LocalModel::new(model_id, cache_path, false).unwrap();
         assert_eq!(local_model.get_max_input_len(), 512);
     }
@@ -375,6 +400,7 @@
         // Integration test for Qwen embedding models
         let model_id = "Qwen/Qwen3-Embedding-0.6B";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let local_model = LocalModel::new(model_id, cache_path.clone(), false)
             .expect("Qwen model should load successfully");
@@ -395,6 +421,7 @@
         // Integration test for Llama-based embedding models.
         let model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let local_model =
             LocalModel::new(model_id, cache_path.clone(), false).expect("Llama model should load");
@@ -410,6 +437,7 @@
         // Integration test for Mistral-based embedding models.
         let model_id = "Locutusque/TinyMistral-248M-v2";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let local_model = LocalModel::new(model_id, cache_path.clone(), false)
             .expect("Mistral model should load");
@@ -424,6 +452,7 @@
         // Integration test for Gemma-based embedding models.
         let model_id = "h2oai/embeddinggemma-300m";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let local_model =
             LocalModel::new(model_id, cache_path.clone(), false).expect("Gemma model should load");
@@ -438,6 +467,7 @@
         // Test batch processing with Qwen model
         let model_id = "Qwen/Qwen3-Embedding-0.6B";
         let cache_path = test_cache_path();
+        ensure_model_cached(model_id, &cache_path);
 
         let result = LocalModel::new(model_id, cache_path.clone(), false);
 
```
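Both test modules use the same memoization idiom: a process-wide `OnceLock<Mutex<HashSet<String>>>` ensures each model is downloaded at most once per test binary, even when many tests request it concurrently. A minimal sketch of the idiom, with a `println!` standing in for the real `build_model_info` download:

```rust
use std::collections::HashSet;
use std::sync::{Mutex, OnceLock};

fn ensure_downloaded(model_id: &str) {
    // One set for the whole process; OnceLock makes initialization
    // race-free and the Mutex serializes the check-then-download step.
    static DONE: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
    let done = DONE.get_or_init(|| Mutex::new(HashSet::new()));
    let mut set = done.lock().expect("download registry poisoned");
    if set.insert(model_id.to_string()) {
        // First caller wins; later callers find the id already present.
        println!("downloading {model_id}...");
    }
}

fn main() {
    ensure_downloaded("Qwen/Qwen3-Embedding-0.6B");
    ensure_downloaded("Qwen/Qwen3-Embedding-0.6B"); // no second download
}
```

Because the lock is held across the download itself, concurrent callers block until the first fetch finishes instead of re-downloading the same model.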