Skip to content

Commit b52bad2

Browse files
author
alvinttang
committed
fix: WASM — Unicode-safe clause split, recursion limit, O(n) search, consistent capitalize
- Map character counts back to byte offsets instead of using lowercased string offsets (prevents panic on non-ASCII chars like Turkish İ where to_lowercase shifts bytes)
- Cap extract_facts recursion at depth 10 to prevent stack overflow in WASM
- Replace O(n²) position() lookup in search with enumerate() index
- Capitalize is_a facts consistently with lives_in and works_at
1 parent 8825c96 commit b52bad2

1 file changed

Lines changed: 34 additions & 13 deletions

File tree

cortex-wasm/src/lib.rs

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ impl CortexWasm {
8787
let query_lower = query.to_lowercase();
8888
let query_words: Vec<&str> = query_lower.split_whitespace().collect();
8989

90-
let mut scored: Vec<SearchResult> = self.memories.iter().map(|m| {
90+
let mut scored: Vec<SearchResult> = self.memories.iter().enumerate().map(|(i, m)| {
9191
let text_lower = m.text.to_lowercase();
9292
let mut score: f32 = 0.0;
9393

@@ -104,8 +104,7 @@ impl CortexWasm {
104104
}
105105

106106
// Recency boost (newer = higher)
107-
let idx = self.memories.iter().position(|x| x.id == m.id).unwrap_or(0);
108-
let recency = idx as f32 / self.memories.len().max(1) as f32;
107+
let recency = i as f32 / self.memories.len().max(1) as f32;
109108
score += recency * 0.2;
110109

111110
SearchResult {
@@ -191,22 +190,44 @@ impl CortexWasm {
191190
/// without breaking values that contain "and" ("Research and Development").
192191
/// Recurses for 3+ clauses. Accepts "I" prefix in second clause.
193192
fn extract_facts(&mut self, text: &str) {
193+
self.extract_facts_inner(text, 0);
194+
}
195+
196+
fn extract_facts_inner(&mut self, text: &str, depth: u8) {
197+
// Guard against unbounded recursion (crafted input with many " and work at " repetitions)
198+
if depth >= 10 {
199+
self.extract_single(text.trim());
200+
return;
201+
}
202+
194203
let verb_prefixes = [
195204
"work at ", "work for ", "i work at ", "i work for ",
196205
"i'm a ", "i am a ", "i'm an ", "i am an ",
197206
"live in ", "i live in ", "i'm based in ", "i am based in ",
198207
"based in ",
199208
];
200209

201-
// Scan ALL " and " / " And " / " AND " positions to find clause boundaries.
202-
// Search in original text to avoid Unicode byte offset mismatch.
203-
let lower = text.to_lowercase();
210+
// Search for " and " case-insensitively by scanning the original text.
211+
// We avoid lowercasing the whole string and using its byte offsets, because
212+
// to_lowercase() can change byte lengths (e.g. Turkish İ → i̇).
213+
let bytes = text.as_bytes();
204214
let mut search_from = 0;
205-
while let Some(rel_pos) = lower[search_from..].find(" and ") {
206-
let pos = search_from + rel_pos;
207-
// Verify pos is valid in original text (ASCII " and " guarantees this for text before it,
208-
// but lowercasing can shift bytes for chars like İ→i̇. Use original text search as fallback.)
209-
if pos + 5 > text.len() { break; }
215+
while search_from + 5 <= bytes.len() {
216+
let rest = &text[search_from..];
217+
// Find next " and " (case-insensitive) in the original string
218+
let rel_pos = match rest.to_lowercase().find(" and ") {
219+
Some(p) => p,
220+
None => break,
221+
};
222+
// Map lowered offset back to original: since " and " is pure ASCII,
223+
// we need the character count up to rel_pos to find the right byte offset.
224+
let orig_char_pos = rest[..rel_pos].chars().count();
225+
let orig_byte_pos: usize = rest.chars().take(orig_char_pos).map(|c| c.len_utf8()).sum();
226+
let pos = search_from + orig_byte_pos;
227+
// Verify the next 5 bytes in the original are " and " (case-insensitive)
228+
if pos + 5 > text.len() || !text.is_char_boundary(pos) || !text.is_char_boundary(pos + 5) {
229+
break;
230+
}
210231
let after = text[pos + 5..].trim_start().to_lowercase();
211232
if verb_prefixes.iter().any(|p| after.starts_with(p)) {
212233
let first = text[..pos].trim();
@@ -222,7 +243,7 @@ impl CortexWasm {
222243
} else {
223244
second.to_string()
224245
};
225-
self.extract_facts(&normalized);
246+
self.extract_facts_inner(&normalized, depth + 1);
226247
return;
227248
}
228249
search_from = pos + 5;
@@ -259,7 +280,7 @@ impl CortexWasm {
259280
if let Some(rest) = lower.strip_prefix(pattern) {
260281
let obj = rest.split(&[',', '.', '!', '?'][..]).next().unwrap_or("").trim();
261282
if !obj.is_empty() {
262-
self.add_fact("User", "is_a", obj, 0.80);
283+
self.add_fact("User", "is_a", &capitalize(obj), 0.80);
263284
}
264285
}
265286
}

0 commit comments

Comments
 (0)