Skip to content

Commit 26d97fb

Browse files
committed
feat(web): implement search-path splitting for multi-tokenization conditions
This is the most crucial aspect needed to properly model token-splitting once we start implementing whitespace fat-fingering. Build-bot: skip build:web Test-bot: skip
1 parent a03bc15 commit 26d97fb

6 files changed

Lines changed: 699 additions & 21 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,11 @@ export class ContextToken {
288288
// Assumption: if we're splitting a token, it's not whitespace - and
289289
// neither are the spun-off tokens. Thus, we don't set the .isWhitespace
290290
// flag field.
291+
//
292+
// Proper splitting with multi-tokenization: may yield multiple variants of
293+
// the requested token count, all of which could be seen as valid.
294+
//
295+
// Depends on how the SearchSpace splits.
291296
throw new Error("Temporarily unimplemented");
292297
// const tokensFromSplit: ContextToken[] = [];
293298

web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,9 @@ export class ContextTokenization {
544544
if(splits[0]?.input.index == i) {
545545
// do a split!
546546
const split = splits.shift();
547+
// Proper splitting with multi-tokenization: may yield multiple
548+
// variants of the requested token count, all of which could be seen as
549+
// valid.
547550
const splitResults = baseTokenization[i].split(split, lexicalModel);
548551
const resultStack = splitResults.reverse();
549552
while(resultStack.length > 0) {

web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts

Lines changed: 189 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,7 @@ export class SearchCluster implements SearchSpace {
7676
this.selectionQueue.enqueueAll(inboundPaths);
7777
} else {
7878
const model = arg2 as LexicalModel;
79-
const rootNode = new SearchNode(model.traverseFromRoot(), this.spaceId, (s) => model.toKey(s));
80-
const rootPath = new SearchPath(rootNode);
79+
const rootPath = new SearchPath(model);
8180
this.selectionQueue.enqueue(rootPath);
8281
}
8382

@@ -168,4 +167,192 @@ export class SearchCluster implements SearchSpace {
168167
public stopTrackingResults() {
169168
delete this.completedPaths;
170169
}
170+
171+
public get parents(): ReadonlyArray<SearchPath> {
172+
return this.selectionQueue.toArray();
173+
}
174+
175+
public get codepointLength(): number {
176+
return this.parents?.[0].codepointLength ?? 0;
177+
}
178+
179+
// public merge(space: SearchSpace): SearchSpace {
180+
// // just... iterate through entries to construct an extended version of THIS
181+
// // search-space.
182+
// // - though... we aren't actually set up to... DO that, are we?
183+
// // - ... .inputs? That might actually work!
184+
// // - issue: merging previously split inputs / input paths. How to identify those?
185+
// // - ** wait: they share the same transform ID on both sides! **
186+
// // - alternative thought: could demark a source path spaceID on split-off paths?
187+
// // - or... maybe just a 'split ID'?
188+
// // - so... that's enough, right? For our purposes?
189+
// // - but split + split again are not technically impossible...
190+
// // - and even then, isn't that actually overcomplicating things?
191+
// //
192+
// // Needs a new spaceID, of course - at each appended step, to be clear.
193+
// //
194+
// //
195+
// // spaceID => SearchSpace, it would seem. Just have the token use the ID
196+
// // from SearchSpace / SearchEdge... wait. SearchEdges with same ID... that's
197+
// // only really available from SearchSpace. May need to brainstorm that a bit.
198+
// // - current design WAS to do the combination on the State level.
199+
// // ... what if SearchSpace re-maps the search path stateIDs to a combined stateID
200+
// // that it emits? Then we don't need to worry about micromanaging search path
201+
// // IDs given how they'll be constructed.
202+
// }
203+
204+
public split(charIndex: number, model: LexicalModel): [SearchSpace, SearchSpace][] {
205+
return this._split(charIndex, model, new Map());
206+
}
207+
208+
// splitCache:
209+
// - key: id of original search path being split
210+
// - value's index: the number of preserved codepoints
211+
// - value's instance at the index: the spun-off search space
212+
private _split(
213+
charIndex: number,
214+
model: LexicalModel,
215+
splitCache: Map<number, {head: SearchCluster, tail: SearchCluster} []>
216+
): [SearchCluster, SearchCluster][] {
217+
if(this.codepointLength == charIndex) {
218+
console.log('a');
219+
// If we're splitting at the tail end of an existing space, just re-use
220+
// the space and pass along an empty one for the end.
221+
return [[this, new SearchCluster(model)]];
222+
}
223+
224+
// Ensure common split-ancestors still resolve to the same entity.
225+
const componentPaths = this.selectionQueue.toArray();
226+
let baseResultSet: [SearchCluster, SearchCluster][] = [];
227+
228+
const deduplicateSplitResults = (results: [SearchCluster, SearchCluster][]) => {
229+
// Re-merge paths that converge to the same point.
230+
const duplicateMap: Map<number, [SearchCluster, SearchCluster][]> = new Map();
231+
results.forEach(result => {
232+
const headSpaceId = result[0].spaceId;
233+
const arr: [SearchCluster, SearchCluster][] = duplicateMap.get(headSpaceId) ?? [];
234+
arr.push(result);
235+
duplicateMap.set(result[0].spaceId, arr);
236+
});
237+
238+
const finalResults: [SearchCluster, SearchCluster][] = [];
239+
for(const splits of duplicateMap.values()) {
240+
const headSpace = splits[0][0];
241+
242+
// const uniqueTailSpaces = [...splits.reduce((set, curr) => {
243+
// if(!set.has(curr[1].spaceId)) {
244+
// set.set(curr[1].spaceId, curr[1]);
245+
// } else {
246+
// console.log('z');
247+
// }
248+
249+
// return set;
250+
// }, new Map<number, SearchSpace>()).values()];
251+
252+
const paths = splits.flatMap(split => split[1].selectionQueue.toArray());
253+
const tailSpace = new SearchCluster(paths);
254+
// const resultPaths: [SearchSpace, SearchSpace][] = uniqueTailSpaces.map((tailSpace) => ([headSpace, tailSpace]));
255+
256+
// resultPaths.forEach(entry => finalResults.push(entry));
257+
finalResults.push([headSpace, tailSpace]);
258+
}
259+
260+
return finalResults;
261+
}
262+
263+
const pathFiltering = componentPaths.reduce((filtering, path) => {
264+
if(path.codepointLength - path.edgeLength > charIndex) {
265+
filtering.inParent.push(path);
266+
} else {
267+
filtering.inCurrent.push(path);
268+
}
269+
270+
return filtering;
271+
}, { inParent: [] as SearchPath[], inCurrent: [] as SearchPath[]})
272+
273+
// should filter all that meet the condition (and those that don't)
274+
if(pathFiltering.inParent.length > 0) {
275+
const parentResults = pathFiltering.inParent.flatMap((path) => {
276+
console.log(`b - ${path.bestExample.text}`);
277+
// TODO: resolve!
278+
const results = (path.parents[0] as SearchCluster)._split(charIndex, model, splitCache);
279+
280+
return results.map((results) => {
281+
const tailSpace = new SearchCluster([results[1].addInput([...path.inputs], path.bestProbInEdge)]);
282+
results[1] = tailSpace;
283+
return results;
284+
});
285+
});
286+
287+
baseResultSet = parentResults;
288+
}
289+
290+
// Re: space IDs - we can't reuse data for anything we're reconstructing
291+
// after the split point. Original space IDs on the left-hand side may
292+
// remain unaltered, but right-hand needs to be re-built from scratch, in
293+
// new SearchPaths / SearchSpaces.
294+
//
295+
// We can optimize how many new spaces/paths we create for the right-hand
296+
// side, though: each starting the same count in, at the same input-offset
297+
// position, should be safe to amalgamate.
298+
const pathResults: [SearchCluster, SearchCluster][] = pathFiltering.inCurrent.map((path) => {
299+
const parentSpace = path.parents[0] ?? new SearchCluster(model);
300+
const pathStartIndex = path.codepointLength - path.edgeLength;
301+
if(path.codepointLength - path.edgeLength == charIndex) {
302+
console.log(`c - ${path.bestExample.text}`);
303+
// yay, great case! Splits cleanly on the boundary BEFORE this path, at
304+
// its start.
305+
//
306+
// parentSpace is thus the END of the prior token.
307+
// Start a new one with the current Path.
308+
// return [parentSpace, new SearchSpace(/* new spaceId */, path /* reconstructed, now space ID */)];
309+
const newPath = new SearchCluster(model).addInput([...path.inputs], path.bestProbInEdge);
310+
return [
311+
parentSpace instanceof SearchPath ? new SearchCluster([parentSpace]) : parentSpace,
312+
new SearchCluster([newPath])
313+
] as [SearchCluster, SearchCluster];
314+
} else {
315+
console.log(`d - ${path.bestExample.text}`);
316+
// OK, so we need to actually split this path in twain.
317+
const pathCharIndex = charIndex - pathStartIndex;
318+
const results = path.split(pathCharIndex, model);
319+
console.log(`pathId: ${path.spaceId} - ${splitCache.has(path.spaceId) ? 'found' : 'not found'}`);
320+
321+
const pathSplitCacheArray = splitCache.get(path.spaceId) ?? [];
322+
splitCache.set(path.spaceId, pathSplitCacheArray);
323+
324+
const newHeadSpace = pathSplitCacheArray[pathCharIndex]?.head ?? new SearchCluster([new SearchPath(parentSpace, [...results[0].inputs], path.bestProbInEdge)]);
325+
const newTailSpace = pathSplitCacheArray[pathCharIndex]?.tail ?? new SearchCluster([new SearchCluster(model).addInput([...results[1].inputs], path.bestProbInEdge)]);
326+
327+
pathSplitCacheArray[pathCharIndex] = {
328+
head: newHeadSpace,
329+
tail: newTailSpace
330+
}
331+
return [newHeadSpace, newTailSpace];
332+
}
333+
});
334+
335+
baseResultSet = pathResults.concat(baseResultSet);
336+
337+
// From pathResults:
338+
// - LHS deduplicate: if same spaceIDs appear on left-hand side, they're the same space;
339+
// we likely split at the same point
340+
// - RHS: check search depth + offset position
341+
// - order by input set likelihood
342+
// - replace other path variants with that
343+
//
344+
// Finally, deduplicate the tuples as much as possible.
345+
// ... wait. Why do we have multiplicity in the paths? We need to be able to reduce things
346+
// down to just 1 + 1 split token, not multiple in each position.
347+
//
348+
// ... first stop: we could just... take the most likely case and ignore the others?
349+
// ... in which case, why evaluate ALL paths?
350+
// - b/c LHS matches could show up multiple times?
351+
//
352+
// Can we mitigate these cases with improved output from the wordbreaker(s) - say,
353+
// about "ambiguous wordbreak" scenarios?
354+
355+
console.log(`result count: ${baseResultSet.length}; results ${JSON.stringify(baseResultSet.map(r => ([r[0].bestExample.text, r[1].bestExample.text])))}`);
356+
return deduplicateSplitResults(baseResultSet);
357+
}
171358
}

web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts

Lines changed: 85 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { generateSpaceSeed, PathResult, SearchSpace } from './search-space.js';
1616
import { SearchCluster } from './search-cluster.js';
1717

1818
import Distribution = LexicalModelTypes.Distribution;
19+
import LexicalModel = LexicalModelTypes.LexicalModel;
1920
import Transform = LexicalModelTypes.Transform;
2021

2122
export const QUEUE_NODE_COMPARATOR: Comparator<SearchNode> = function(arg1, arg2) {
@@ -26,10 +27,16 @@ export const QUEUE_NODE_COMPARATOR: Comparator<SearchNode> = function(arg1, arg2
2627
// Whenever a wordbreak boundary is crossed, a new instance should be made.
2728
export class SearchPath implements SearchSpace {
2829
private selectionQueue: PriorityQueue<SearchNode> = new PriorityQueue(QUEUE_NODE_COMPARATOR);
29-
private inputs?: Distribution<Transform>;
30+
private _inputs?: Distribution<Transform>;
31+
32+
public get inputs(): Distribution<Transform> {
33+
return this._inputs;
34+
}
3035

3136
readonly rootPath: SearchPath;
3237

38+
readonly bestProbInEdge: number;
39+
3340
private parentSpace: SearchSpace;
3441
readonly spaceId: number;
3542

@@ -64,19 +71,20 @@ export class SearchPath implements SearchSpace {
6471
* @param baseSpaceId
6572
* @param model
6673
*/
67-
constructor(node?: SearchNode);
68-
constructor(space: SearchCluster, inputs: Distribution<Transform>, bestProbFromSet: number);
69-
constructor(arg1?: SearchNode | SearchCluster, inputs?: Distribution<Transform>, bestProbFromSet?: number) {
74+
constructor(model?: LexicalModel);
75+
constructor(space: SearchSpace, inputs: Distribution<Transform>, bestProbFromSet: number);
76+
constructor(arg1?: LexicalModel | SearchSpace, inputs?: Distribution<Transform>, bestProbFromSet?: number) {
7077
// If we're taking in a pre-constructed search node, it's got an associated,
7178
// pre-assigned spaceID - so use that.
72-
const isExtending = !(arg1 instanceof SearchNode);
73-
this.spaceId = isExtending ? generateSpaceSeed() : arg1.spaceId;
79+
const isExtending = arg1 instanceof SearchCluster || arg1 instanceof SearchPath;
80+
this.spaceId = generateSpaceSeed();
7481

7582
if(isExtending) {
7683
const parentSpace = arg1;
84+
this.bestProbInEdge = bestProbFromSet;
7785
const logTierCost = -Math.log(bestProbFromSet);
7886

79-
this.inputs = inputs;
87+
this._inputs = inputs;
8088
this.lowestPossibleSingleCost = parentSpace.lowestPossibleSingleCost + logTierCost;
8189
this.rootPath = parentSpace.rootPath;
8290
this.parentSpace = parentSpace;
@@ -86,10 +94,12 @@ export class SearchPath implements SearchSpace {
8694
return;
8795
}
8896

89-
const node = arg1;
90-
this.selectionQueue.enqueue(node);
91-
this.lowestPossibleSingleCost = 0;
97+
const model = arg1 as LexicalModel;
98+
const rootNode = new SearchNode(model.traverseFromRoot(), this.spaceId, t => model.toKey(t));
99+
this.selectionQueue.enqueue(rootNode);
100+
this.lowestPossibleSingleCost = 1;
92101
this.rootPath = this;
102+
this.bestProbInEdge = 1;
93103
}
94104

95105
/**
@@ -99,9 +109,9 @@ export class SearchPath implements SearchSpace {
99109
const parentSequences = this.parentSpace?.inputSequences ?? [];
100110

101111
if(parentSequences.length == 0) {
102-
return this.inputs ? [[this.inputs]] : [];
112+
return this._inputs ? [[this._inputs]] : [];
103113
} else {
104-
return parentSequences.map(s => [...s, this.inputs]);
114+
return parentSequences.map(s => [...s, this._inputs]);
105115
}
106116
}
107117

@@ -113,9 +123,28 @@ export class SearchPath implements SearchSpace {
113123
}
114124
}
115125

126+
public get logTierCost(): number {
127+
return -Math.log(this.bestProbInEdge);
128+
}
129+
130+
// TODO: track as a class property; avoid the need for repeated string calculations.
131+
// Or just use the subset and its pre-known length/delete values in some manner.
132+
public get edgeLength(): number {
133+
const insert = this._inputs?.[0].sample.insert ?? '';
134+
return KMWString.length(insert);
135+
}
136+
137+
// TODO: consider optimizing this; we could certainly precompute these values
138+
// rather than recalculating it each time.
139+
public get codepointLength(): number {
140+
const deleteLeft = this._inputs?.[0].sample.deleteLeft ?? 0;
141+
const baseLength = this.parentSpace?.codepointLength ?? 0;
142+
return baseLength + this.edgeLength - deleteLeft;
143+
}
144+
116145
public get bestExample(): {text: string, p: number} {
117146
const bestPrefix = this.parentSpace?.bestExample ?? { text: '', p: 1 };
118-
const bestLocalInput = this.inputs?.reduce((max, curr) => max.p < curr.p ? curr : max) ?? { sample: { insert: '', deleteLeft: 0 }, p: 1};
147+
const bestLocalInput = this._inputs?.reduce((max, curr) => max.p < curr.p ? curr : max) ?? { sample: { insert: '', deleteLeft: 0 }, p: 1};
119148

120149
return {
121150
text: KMWString.substring(bestPrefix.text, 0, KMWString.length(bestPrefix.text) - bestLocalInput.sample.deleteLeft) + bestLocalInput.sample.insert,
@@ -136,6 +165,49 @@ export class SearchPath implements SearchSpace {
136165
this.selectionQueue = new PriorityQueue<SearchNode>(QUEUE_NODE_COMPARATOR, entries);
137166
}
138167

168+
get parents(): [SearchSpace] {
169+
return [this.parentSpace];
170+
}
171+
172+
// ... maaaaybe only call if actually splitting?
173+
// charIndex: index within this.edgeLength where the split may occur.
174+
public split(charIndex: number, model: LexicalModel): [SearchPath, SearchPath] {
175+
// ... might be calculated from the SearchSpace class?
176+
if(charIndex < this.edgeLength) {
177+
// TODO: split!
178+
const firstSet: Distribution<Transform> = this._inputs.map((input) => ({
179+
// keep insert head
180+
// keep deleteLeft
181+
sample: {
182+
insert: KMWString.substring(input.sample.insert, 0, charIndex),
183+
deleteLeft: input.sample.deleteLeft
184+
}, p: input.p
185+
}));
186+
187+
const secondSet: Distribution<Transform> = this._inputs.map((input) => ({
188+
// keep insert tail
189+
// deleteLeft == 0
190+
sample: {
191+
insert: KMWString.substring(input.sample.insert, charIndex),
192+
deleteLeft: 0
193+
}, p: input.p
194+
}));
195+
196+
// construct two SearchPath instances based on the two sets!
197+
return [
198+
new SearchPath(this.parentSpace, firstSet, this.logTierCost),
199+
new SearchPath(new SearchPath(model), secondSet, this.logTierCost)
200+
];
201+
} else {
202+
// this instance = 'first set'
203+
// second instance: empty transforms.
204+
//
205+
// stopgap: maybe go ahead and check each input for any that are longer?
206+
// won't matter shortly, though.
207+
return [this, new SearchPath(model)];
208+
}
209+
}
210+
139211
get correctionsEnabled(): boolean {
140212
// When corrections are disabled, the Web engine will only provide individual Transforms
141213
// for an input, not a distribution. No distributions means we shouldn't do corrections.

web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ export interface SearchSpace {
9090
*/
9191
readonly inputSequences: Distribution<Transform>[][];
9292

93+
/**
94+
* Reports the length in codepoints of corrected text represented by completed
95+
* paths from this instance.
96+
*/
97+
readonly codepointLength: number;
98+
9399
/**
94100
* Determines the best example text representable by this batcher's portion of
95101
* the correction-search graph and its paths.

0 commit comments

Comments
 (0)