Skip to content

Commit 26d97fb

Browse files
committed
feat(web): implement search-path splitting for multi-tokenization conditions
This is the most crucial aspect needed to properly model token-splitting once we start implementing whitespace fat-fingering. Build-bot: skip build:web Test-bot: skip
1 parent a03bc15 commit 26d97fb

6 files changed

Lines changed: 699 additions & 21 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,11 @@ export class ContextToken {
288288
// Assumption: if we're splitting a token, it's not whitespace - and
289289
// neither are the spun-off tokens. Thus, we don't set the .isWhitespace
290290
// flag field.
291+
//
292+
// Proper splitting with multi-tokenization: may yield multiple variants of
293+
// the requested token count, all of which could be seen as valid.
294+
//
295+
// Depends on how the SearchSpace splits.
291296
throw new Error("Temporarily unimplemented");
292297
// const tokensFromSplit: ContextToken[] = [];
293298

web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,9 @@ export class ContextTokenization {
544544
if(splits[0]?.input.index == i) {
545545
// do a split!
546546
const split = splits.shift();
547+
// Proper splitting with multi-tokenization: may yield multiple
548+
// variants of the requested token count, all of which could be seen as
549+
// valid.
547550
const splitResults = baseTokenization[i].split(split, lexicalModel);
548551
const resultStack = splitResults.reverse();
549552
while(resultStack.length > 0) {

web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts

Lines changed: 189 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,7 @@ export class SearchCluster implements SearchSpace {
7676
this.selectionQueue.enqueueAll(inboundPaths);
7777
} else {
7878
const model = arg2 as LexicalModel;
79-
const rootNode = new SearchNode(model.traverseFromRoot(), this.spaceId, (s) => model.toKey(s));
80-
const rootPath = new SearchPath(rootNode);
79+
const rootPath = new SearchPath(model);
8180
this.selectionQueue.enqueue(rootPath);
8281
}
8382

@@ -168,4 +167,192 @@ export class SearchCluster implements SearchSpace {
168167
public stopTrackingResults() {
169168
delete this.completedPaths;
170169
}
170+
171+
public get parents(): ReadonlyArray<SearchPath> {
172+
return this.selectionQueue.toArray();
173+
}
174+
175+
public get codepointLength(): number {
176+
return this.parents?.[0].codepointLength ?? 0;
177+
}
178+
179+
// public merge(space: SearchSpace): SearchSpace {
180+
// // just... iterate through entries to construct an extended version of THIS
181+
// // search-space.
182+
// // - though... we aren't actually set up to... DO that, are we?
183+
// // - ... .inputs? That might actually work!
184+
// // - issue: merging previously split inputs / input paths. How to identify those?
185+
// // - ** wait: they share the same transform ID on both sides! **
186+
// // - alternative thought: could demark a source path spaceID on split-off paths?
187+
// // - or... maybe just a 'split ID'?
188+
// // - so... that's enough, right? For our purposes?
189+
// // - but split + split again are not technically impossible...
190+
// // - and even then, isn't that actually overcomplicating things?
191+
// //
192+
// // Needs a new spaceID, of course - at each appended step, to be clear.
193+
// //
194+
// //
195+
// // spaceID => SearchSpace, it would seem. Just have the token use the ID
196+
// // from SearchSpace / SearchEdge... wait. SearchEdges with same ID... that's
197+
// // only really available from SearchSpace. May need to brainstorm that a bit.
198+
// // - current design WAS to do the combination on the State level.
199+
// // ... what if SearchSpace re-maps the search path stateIDs to a combined stateID
200+
// // that it emits? Then we don't need to worry about micromanaging search path
201+
// // IDs given how they'll be constructed.
202+
// }
203+
204+
public split(charIndex: number, model: LexicalModel): [SearchSpace, SearchSpace][] {
205+
return this._split(charIndex, model, new Map());
206+
}
207+
208+
// splitCache:
209+
// - key: id of original search path being split
210+
// - value's index: the number of preserved codepoints
211+
// - value's instance at the index: the spun-off search space
212+
private _split(
213+
charIndex: number,
214+
model: LexicalModel,
215+
splitCache: Map<number, {head: SearchCluster, tail: SearchCluster} []>
216+
): [SearchCluster, SearchCluster][] {
217+
if(this.codepointLength == charIndex) {
218+
console.log('a');
219+
// If we're splitting at the tail end of an existing space, just re-use
220+
// the space and pass along an empty one for the end.
221+
return [[this, new SearchCluster(model)]];
222+
}
223+
224+
// Ensure common split-ancestors still resolve to the same entity.
225+
const componentPaths = this.selectionQueue.toArray();
226+
let baseResultSet: [SearchCluster, SearchCluster][] = [];
227+
228+
const deduplicateSplitResults = (results: [SearchCluster, SearchCluster][]) => {
229+
// Re-merge paths that converge to the same point.
230+
const duplicateMap: Map<number, [SearchCluster, SearchCluster][]> = new Map();
231+
results.forEach(result => {
232+
const headSpaceId = result[0].spaceId;
233+
const arr: [SearchCluster, SearchCluster][] = duplicateMap.get(headSpaceId) ?? [];
234+
arr.push(result);
235+
duplicateMap.set(result[0].spaceId, arr);
236+
});
237+
238+
const finalResults: [SearchCluster, SearchCluster][] = [];
239+
for(const splits of duplicateMap.values()) {
240+
const headSpace = splits[0][0];
241+
242+
// const uniqueTailSpaces = [...splits.reduce((set, curr) => {
243+
// if(!set.has(curr[1].spaceId)) {
244+
// set.set(curr[1].spaceId, curr[1]);
245+
// } else {
246+
// console.log('z');
247+
// }
248+
249+
// return set;
250+
// }, new Map<number, SearchSpace>()).values()];
251+
252+
const paths = splits.flatMap(split => split[1].selectionQueue.toArray());
253+
const tailSpace = new SearchCluster(paths);
254+
// const resultPaths: [SearchSpace, SearchSpace][] = uniqueTailSpaces.map((tailSpace) => ([headSpace, tailSpace]));
255+
256+
// resultPaths.forEach(entry => finalResults.push(entry));
257+
finalResults.push([headSpace, tailSpace]);
258+
}
259+
260+
return finalResults;
261+
}
262+
263+
const pathFiltering = componentPaths.reduce((filtering, path) => {
264+
if(path.codepointLength - path.edgeLength > charIndex) {
265+
filtering.inParent.push(path);
266+
} else {
267+
filtering.inCurrent.push(path);
268+
}
269+
270+
return filtering;
271+
}, { inParent: [] as SearchPath[], inCurrent: [] as SearchPath[]})
272+
273+
// should filter all that meet the condition (and those that don't)
274+
if(pathFiltering.inParent.length > 0) {
275+
const parentResults = pathFiltering.inParent.flatMap((path) => {
276+
console.log(`b - ${path.bestExample.text}`);
277+
// TODO: resolve!
278+
const results = (path.parents[0] as SearchCluster)._split(charIndex, model, splitCache);
279+
280+
return results.map((results) => {
281+
const tailSpace = new SearchCluster([results[1].addInput([...path.inputs], path.bestProbInEdge)]);
282+
results[1] = tailSpace;
283+
return results;
284+
});
285+
});
286+
287+
baseResultSet = parentResults;
288+
}
289+
290+
// Re: space IDs - we can't reuse data for anything we're reconstructing
291+
// after the split point. Original space IDs on the left-hand side may
292+
// remain unaltered, but right-hand needs to be re-built from scratch, in
293+
// new SearchPaths / SearchSpaces.
294+
//
295+
// We can optimize how many new spaces/paths we create for the right-hand
296+
// side, though: each starting the same count in, at the same input-offset
297+
// position, should be safe to amalgamate.
298+
const pathResults: [SearchCluster, SearchCluster][] = pathFiltering.inCurrent.map((path) => {
299+
const parentSpace = path.parents[0] ?? new SearchCluster(model);
300+
const pathStartIndex = path.codepointLength - path.edgeLength;
301+
if(path.codepointLength - path.edgeLength == charIndex) {
302+
console.log(`c - ${path.bestExample.text}`);
303+
// yay, great case! Splits cleanly on the boundary BEFORE this path, at
304+
// its start.
305+
//
306+
// parentSpace is thus the END of the prior token.
307+
// Start a new one with the current Path.
308+
// return [parentSpace, new SearchSpace(/* new spaceId */, path /* reconstructed, now space ID */)];
309+
const newPath = new SearchCluster(model).addInput([...path.inputs], path.bestProbInEdge);
310+
return [
311+
parentSpace instanceof SearchPath ? new SearchCluster([parentSpace]) : parentSpace,
312+
new SearchCluster([newPath])
313+
] as [SearchCluster, SearchCluster];
314+
} else {
315+
console.log(`d - ${path.bestExample.text}`);
316+
// OK, so we need to actually split this path in twain.
317+
const pathCharIndex = charIndex - pathStartIndex;
318+
const results = path.split(pathCharIndex, model);
319+
console.log(`pathId: ${path.spaceId} - ${splitCache.has(path.spaceId) ? 'found' : 'not found'}`);
320+
321+
const pathSplitCacheArray = splitCache.get(path.spaceId) ?? [];
322+
splitCache.set(path.spaceId, pathSplitCacheArray);
323+
324+
const newHeadSpace = pathSplitCacheArray[pathCharIndex]?.head ?? new SearchCluster([new SearchPath(parentSpace, [...results[0].inputs], path.bestProbInEdge)]);
325+
const newTailSpace = pathSplitCacheArray[pathCharIndex]?.tail ?? new SearchCluster([new SearchCluster(model).addInput([...results[1].inputs], path.bestProbInEdge)]);
326+
327+
pathSplitCacheArray[pathCharIndex] = {
328+
head: newHeadSpace,
329+
tail: newTailSpace
330+
}
331+
return [newHeadSpace, newTailSpace];
332+
}
333+
});
334+
335+
baseResultSet = pathResults.concat(baseResultSet);
336+
337+
// From pathResults:
338+
// - LHS deduplicate: if same spaceIDs appear on left-hand side, they're the same space;
339+
// we likely split at the same point
340+
// - RHS: check search depth + offset position
341+
// - order by input set likelihood
342+
// - replace other path variants with that
343+
//
344+
// Finally, deduplicate the tuples as much as possible.
345+
// ... wait. Why do we have multiplicity in the paths? We need to be able to reduce things
346+
// down to just 1 + 1 split token, not multiple in each position.
347+
//
348+
// ... first stop: we could just... take the most likely case and ignore the others?
349+
// ... in which case, why evaluate ALL paths?
350+
// - b/c LHS matches could show up multiple times?
351+
//
352+
// Can we mitigate these cases with improved output from the wordbreaker(s) - say,
353+
// about "ambiguous wordbreak" scenarios?
354+
355+
console.log(`result count: ${baseResultSet.length}; results ${JSON.stringify(baseResultSet.map(r => ([r[0].bestExample.text, r[1].bestExample.text])))}`);
356+
return deduplicateSplitResults(baseResultSet);
357+
}
171358
}

web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts

Lines changed: 85 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { generateSpaceSeed, PathResult, SearchSpace } from './search-space.js';
1616
import { SearchCluster } from './search-cluster.js';
1717

1818
import Distribution = LexicalModelTypes.Distribution;
19+
import LexicalModel = LexicalModelTypes.LexicalModel;
1920
import Transform = LexicalModelTypes.Transform;
2021

2122
export const QUEUE_NODE_COMPARATOR: Comparator<SearchNode> = function(arg1, arg2) {
@@ -26,10 +27,16 @@ export const QUEUE_NODE_COMPARATOR: Comparator<SearchNode> = function(arg1, arg2
2627
// Whenever a wordbreak boundary is crossed, a new instance should be made.
2728
export class SearchPath implements SearchSpace {
2829
private selectionQueue: PriorityQueue<SearchNode> = new PriorityQueue(QUEUE_NODE_COMPARATOR);
29-
private inputs?: Distribution<Transform>;
30+
private _inputs?: Distribution<Transform>;
31+
32+
public get inputs(): Distribution<Transform> {
33+
return this._inputs;
34+
}
3035

3136
readonly rootPath: SearchPath;
3237

38+
readonly bestProbInEdge: number;
39+
3340
private parentSpace: SearchSpace;
3441
readonly spaceId: number;
3542

@@ -64,19 +71,20 @@ export class SearchPath implements SearchSpace {
6471
* @param baseSpaceId
6572
* @param model
6673
*/
67-
constructor(node?: SearchNode);
68-
constructor(space: SearchCluster, inputs: Distribution<Transform>, bestProbFromSet: number);
69-
constructor(arg1?: SearchNode | SearchCluster, inputs?: Distribution<Transform>, bestProbFromSet?: number) {
74+
constructor(model?: LexicalModel);
75+
constructor(space: SearchSpace, inputs: Distribution<Transform>, bestProbFromSet: number);
76+
constructor(arg1?: LexicalModel | SearchSpace, inputs?: Distribution<Transform>, bestProbFromSet?: number) {
7077
// If we're taking in a pre-constructed search node, it's got an associated,
7178
// pre-assigned spaceID - so use that.
72-
const isExtending = !(arg1 instanceof SearchNode);
73-
this.spaceId = isExtending ? generateSpaceSeed() : arg1.spaceId;
79+
const isExtending = arg1 instanceof SearchCluster || arg1 instanceof SearchPath;
80+
this.spaceId = generateSpaceSeed();
7481

7582
if(isExtending) {
7683
const parentSpace = arg1;
84+
this.bestProbInEdge = bestProbFromSet;
7785
const logTierCost = -Math.log(bestProbFromSet);
7886

79-
this.inputs = inputs;
87+
this._inputs = inputs;
8088
this.lowestPossibleSingleCost = parentSpace.lowestPossibleSingleCost + logTierCost;
8189
this.rootPath = parentSpace.rootPath;
8290
this.parentSpace = parentSpace;
@@ -86,10 +94,12 @@ export class SearchPath implements SearchSpace {
8694
return;
8795
}
8896

89-
const node = arg1;
90-
this.selectionQueue.enqueue(node);
91-
this.lowestPossibleSingleCost = 0;
97+
const model = arg1 as LexicalModel;
98+
const rootNode = new SearchNode(model.traverseFromRoot(), this.spaceId, t => model.toKey(t));
99+
this.selectionQueue.enqueue(rootNode);
100+
this.lowestPossibleSingleCost = 1;
92101
this.rootPath = this;
102+
this.bestProbInEdge = 1;
93103
}
94104

95105
/**
@@ -99,9 +109,9 @@ export class SearchPath implements SearchSpace {
99109
const parentSequences = this.parentSpace?.inputSequences ?? [];
100110

101111
if(parentSequences.length == 0) {
102-
return this.inputs ? [[this.inputs]] : [];
112+
return this._inputs ? [[this._inputs]] : [];
103113
} else {
104-
return parentSequences.map(s => [...s, this.inputs]);
114+
return parentSequences.map(s => [...s, this._inputs]);
105115
}
106116
}
107117

@@ -113,9 +123,28 @@ export class SearchPath implements SearchSpace {
113123
}
114124
}
115125

126+
public get logTierCost(): number {
127+
return -Math.log(this.bestProbInEdge);
128+
}
129+
130+
// TODO: track as a class property; avoid the need for repeated string calculations.
131+
// Or just use the subset and its pre-known length/delete values in some manner.
132+
public get edgeLength(): number {
133+
const insert = this._inputs?.[0].sample.insert ?? '';
134+
return KMWString.length(insert);
135+
}
136+
137+
// TODO: consider optimizing this; we could certainly precompute these values
138+
// rather than recalculating it each time.
139+
public get codepointLength(): number {
140+
const deleteLeft = this._inputs?.[0].sample.deleteLeft ?? 0;
141+
const baseLength = this.parentSpace?.codepointLength ?? 0;
142+
return baseLength + this.edgeLength - deleteLeft;
143+
}
144+
116145
public get bestExample(): {text: string, p: number} {
117146
const bestPrefix = this.parentSpace?.bestExample ?? { text: '', p: 1 };
118-
const bestLocalInput = this.inputs?.reduce((max, curr) => max.p < curr.p ? curr : max) ?? { sample: { insert: '', deleteLeft: 0 }, p: 1};
147+
const bestLocalInput = this._inputs?.reduce((max, curr) => max.p < curr.p ? curr : max) ?? { sample: { insert: '', deleteLeft: 0 }, p: 1};
119148

120149
return {
121150
text: KMWString.substring(bestPrefix.text, 0, KMWString.length(bestPrefix.text) - bestLocalInput.sample.deleteLeft) + bestLocalInput.sample.insert,
@@ -136,6 +165,49 @@ export class SearchPath implements SearchSpace {
136165
this.selectionQueue = new PriorityQueue<SearchNode>(QUEUE_NODE_COMPARATOR, entries);
137166
}
138167

168+
get parents(): [SearchSpace] {
169+
return [this.parentSpace];
170+
}
171+
172+
// ... maaaaybe only call if actually splitting?
173+
// charIndex: index within this.edgeLength where the split may occur.
174+
public split(charIndex: number, model: LexicalModel): [SearchPath, SearchPath] {
175+
// ... might be calculated from the SearchSpace class?
176+
if(charIndex < this.edgeLength) {
177+
// TODO: split!
178+
const firstSet: Distribution<Transform> = this._inputs.map((input) => ({
179+
// keep insert head
180+
// keep deleteLeft
181+
sample: {
182+
insert: KMWString.substring(input.sample.insert, 0, charIndex),
183+
deleteLeft: input.sample.deleteLeft
184+
}, p: input.p
185+
}));
186+
187+
const secondSet: Distribution<Transform> = this._inputs.map((input) => ({
188+
// keep insert tail
189+
// deleteLeft == 0
190+
sample: {
191+
insert: KMWString.substring(input.sample.insert, charIndex),
192+
deleteLeft: 0
193+
}, p: input.p
194+
}));
195+
196+
// construct two SearchPath instances based on the two sets!
197+
return [
198+
new SearchPath(this.parentSpace, firstSet, this.logTierCost),
199+
new SearchPath(new SearchPath(model), secondSet, this.logTierCost)
200+
];
201+
} else {
202+
// this instance = 'first set'
203+
// second instance: empty transforms.
204+
//
205+
// stopgap: maybe go ahead and check each input for any that are longer?
206+
// won't matter shortly, though.
207+
return [this, new SearchPath(model)];
208+
}
209+
}
210+
139211
get correctionsEnabled(): boolean {
140212
// When corrections are disabled, the Web engine will only provide individual Transforms
141213
// for an input, not a distribution. No distributions means we shouldn't do corrections.

web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ export interface SearchSpace {
9090
*/
9191
readonly inputSequences: Distribution<Transform>[][];
9292

93+
/**
94+
* Reports the length in codepoints of corrected text represented by completed
95+
* paths from this instance.
96+
*/
97+
readonly codepointLength: number;
98+
9399
/**
94100
* Determines the best example text representable by this batcher's portion of
95101
* the correction-search graph and its paths.

0 commit comments

Comments
 (0)