refactor(web): begin implementation of quotient-path convergence via SearchQuotientCluster 🚂 #14949
base: epic/autocorrect
Changes from all commits: f8aa3d4, 6048b3d, d3857a1, cae5c2f
@@ -0,0 +1,185 @@

```ts
/*
 * Keyman is copyright (C) SIL Global. MIT License.
 *
 * Created by jahorton on 2025-10-20
 *
 * This file defines the predictive-text engine's SearchSpace class, which is used to
 * manage the search-space(s) for text corrections within the engine.
 */

import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils';
```

> **Contributor:** Can we please directly use
> *Suggested change*
```ts
import { LexicalModelTypes } from '@keymanapp/common-types';

import { SearchNode, SearchResult } from './distance-modeler.js';
import { generateSpaceSeed, InputSegment, PathResult, SearchQuotientNode } from './search-quotient-node.js';

const PATH_QUEUE_COMPARATOR: Comparator<SearchQuotientNode> = (a, b) => {
  return a.currentCost - b.currentCost;
}
```

> **Contributor:**
> *Suggested change*
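For orientation, the comparator orders entries ascending by `currentCost`, so the `PriorityQueue` behaves as a min-heap: `peek()` and `dequeue()` always surface the cheapest remaining path. A minimal sketch of that behavior, assuming only the `PriorityQueue` surface already exercised in this file (`enqueueAll`, `dequeue`, `peek`):

```ts
import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils';

// Hypothetical stand-in type; only the field the comparator reads is needed.
interface Costed { currentCost: number; }

const byCost: Comparator<Costed> = (a, b) => a.currentCost - b.currentCost;

const queue = new PriorityQueue<Costed>(byCost);
queue.enqueueAll([{ currentCost: 3.2 }, { currentCost: 1.5 }, { currentCost: 2.0 }]);

queue.dequeue(); // { currentCost: 1.5 } - the lowest-cost entry comes out first
queue.peek();    // { currentCost: 2.0 } - the next-cheapest waits at the head
```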
```ts
// The set of search spaces corresponding to the same 'context' for search.
// Whenever a wordbreak boundary is crossed, a new instance should be made.
export class SearchQuotientCluster implements SearchQuotientNode {
  // While most functions can be done directly from SearchSpace, merging and splitting will need access
  // to SearchPath-specific members. It's also cleaner to not allow nested SearchClusters while we
  // haven't worked out support for such a scenario.
  private selectionQueue: PriorityQueue<SearchQuotientNode> = new PriorityQueue(PATH_QUEUE_COMPARATOR);
  readonly spaceId: number;

  // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time
  // is cheaper than iteratively building a priority queue.
  /**
   * This tracks all paths that have reached the end of a viable input-matching path - even
   * those of lower cost that produce the same correction as other paths.
   *
   * When new input is received, its entries are then used to append edges to the path in order
   * to find potential paths to reach a new viable end.
   */
  private completedPaths?: SearchNode[] = [];

  /**
   * Acts as a Map that prevents duplicating a correction-search path if reached
   * more than once.
   */
  protected get processedEdgeSet(): {[pathKey: string]: boolean} {
    return this._processedEdgeSet;
  }

  private _processedEdgeSet?: {[pathKey: string]: boolean} = {};

  /**
   * Provides a heuristic for the base cost at each depth if the best
   * individual input were taken at that level.
   */
  readonly lowestPossibleSingleCost: number;

  /**
   * Constructs a fresh SearchSpace instance for used in predictive-text correction
   * and suggestion searches.
   * @param baseSpaceId
   * @param model
   */
```

> **Contributor:**
> *Suggested change*

> **Contributor** (comment on lines +59 to +60):
> *Suggested change*
```ts
  constructor(inboundPaths: SearchQuotientNode[]) {
    if(inboundPaths.length == 0) {
      throw new Error("SearchCluster requires an array with at least one SearchPath");
    }

    let lowestPossibleSingleCost = Number.POSITIVE_INFINITY;
    const firstPath = inboundPaths[0];
    const inputCount = firstPath.inputCount;
    const codepointLength = firstPath.codepointLength;
    const sourceRangeKey = firstPath.sourceRangeKey;

    for(let path of inboundPaths) {
      if(path.inputCount != inputCount || path.codepointLength != codepointLength) {
        throw new Error(`SearchPath does not share same properties as others in the cluster: inputCount ${path.inputCount} vs ${inputCount}, codepointLength ${path.codepointLength} vs ${codepointLength}`);
      }

      // If there's a source-range key mismatch - via mismatch in count or in actual ID, we have an error.
      if(path.sourceRangeKey != sourceRangeKey) {
        throw new Error(`SearchPath does not share the same source identifiers as others in the cluster`);
      }

      lowestPossibleSingleCost = Math.min(lowestPossibleSingleCost, path.lowestPossibleSingleCost);
    }

    this.spaceId = generateSpaceSeed();

    this.lowestPossibleSingleCost = lowestPossibleSingleCost;
    this.completedPaths = inboundPaths.flatMap(p => p.previousResults).map(r => r.node);
    this.selectionQueue.enqueueAll(inboundPaths);

    return;
  }
```
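The constructor's validation enforces the "quotient" invariant: every inbound path must cover the same slice of input (same `inputCount`, `codepointLength`, and `sourceRangeKey`); member paths may differ only in the corrections they represent. A distilled sketch of that check, using a hypothetical `PathLike` shape that carries just the fields the invariant reads:

```ts
// Hypothetical slice of SearchQuotientNode; only the invariant's fields.
interface PathLike {
  inputCount: number;
  codepointLength: number;
  sourceRangeKey: string;
}

// All paths merged into one cluster must describe the same slice of input.
// (The real constructor additionally rejects empty arrays.)
function isClusterCompatible(paths: PathLike[]): boolean {
  const [first, ...rest] = paths;
  return rest.every(p =>
    p.inputCount == first.inputCount &&
    p.codepointLength == first.codepointLength &&
    p.sourceRangeKey == first.sourceRangeKey
  );
}
```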
```ts
  public get inputCount(): number {
    return this.selectionQueue.peek()?.inputCount ?? 0;
  }

  public get bestExample(): {text: string, p: number} {
    const bestPrefixes = this.selectionQueue.toArray().map(p => p.bestExample);
    return bestPrefixes.reduce((max, curr) => max.p < curr.p ? curr : max);
  }

  public get parents(): SearchQuotientNode[] {
    return this.selectionQueue.toArray().slice();
  }

  increaseMaxEditDistance() {
    // By extracting the entries from the priority queue and increasing distance outside of it as a batch job,
    // we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue.
    const entries = this.selectionQueue.toArray();

    entries.forEach((path) => path.increaseMaxEditDistance());

    // Since we just modified the stored instances, and the costs may have shifted, we need to re-heapify.
    this.selectionQueue = new PriorityQueue<SearchQuotientNode>(PATH_QUEUE_COMPARATOR, entries.slice());
  }
```
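The rebuild above leans on a classic result: bottom-up heapification of N items costs O(N), while N individual inserts into a growing heap cost O(N log N). A sketch contrasting the two approaches, assuming (as the code comment above states) that the two-argument `PriorityQueue` constructor batch-heapifies its initial entries:

```ts
declare const entries: SearchQuotientNode[];

// O(N log N): each enqueue sifts one element through an ever-growing heap.
const rebuiltIncrementally = new PriorityQueue<SearchQuotientNode>(PATH_QUEUE_COMPARATOR);
for(const entry of entries) {
  rebuiltIncrementally.enqueue(entry);
}

// O(N): hand the whole batch to the constructor and heapify once.
const rebuiltAsBatch = new PriorityQueue<SearchQuotientNode>(PATH_QUEUE_COMPARATOR, entries.slice());
```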
```ts
  /**
   * When true, this indicates that the currently-represented portion of context
   * has fat-finger data available, which itself indicates that the user has
   * corrections enabled.
   */
  get correctionsEnabled(): boolean {
    const paths = this.selectionQueue.toArray();
    // When corrections are disabled, the Web engine will only provide individual Transforms
    // for an input, not a distribution. No distributions means we shouldn't do corrections.
    return !!paths.find(p => p.correctionsEnabled);
  }
```

> **Contributor:**
> *Suggested change*
```ts
  public get currentCost(): number {
    return this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY;
  }

  /**
   * Retrieves the lowest-cost / lowest-distance edge from the selection queue,
   * checks its validity as a correction to the input text, and reports on what
   * sort of result the edge's destination node represents.
   * @returns
   */
  public handleNextNode(): PathResult {
    const bestPath = this.selectionQueue.dequeue();
    const currentResult = bestPath.handleNextNode();
    this.selectionQueue.enqueue(bestPath);

    if(currentResult.type == 'complete') {
      this.completedPaths?.push(currentResult.finalNode);
      currentResult.spaceId = this.spaceId;
    }

    return currentResult;
  }

  public get previousResults(): SearchResult[] {
    return this.completedPaths?.map((n => new SearchResult(n, this.spaceId))) ?? [];
  }
```
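`handleNextNode` dequeues the cheapest member path, advances it one step, and re-enqueues it, so the cluster as a whole advances in global best-first order across all of its member paths. A hypothetical driver loop built on that contract; the function name, result cap, and iteration guard below are illustrative, not part of this PR:

```ts
// Hypothetical: drain up to `maxResults` completed corrections from a cluster.
function drainBestResults(cluster: SearchQuotientCluster, maxResults: number, maxSteps = 1000): PathResult[] {
  const completions: PathResult[] = [];
  for(let step = 0; step < maxSteps && completions.length < maxResults; step++) {
    const result = cluster.handleNextNode();
    // 'complete' marks a path that has matched all input and yields a viable correction.
    if(result.type == 'complete') {
      completions.push(result);
    }
  }
  return completions;
}
```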
```ts
  get model(): LexicalModelTypes.LexicalModel {
    return this.parents[0].model;
  }

  get codepointLength(): number {
    return this.parents[0].codepointLength;
  }

  get inputSegments(): InputSegment[] {
    return this.parents[0].inputSegments;
  };

  /**
   * Gets a compact string-based representation of `inputRange` that
   * maps compatible token source ranges to each other.
   */
  get sourceRangeKey(): string {
    return this.parents[0].sourceRangeKey;
  }

  merge(space: SearchQuotientNode): SearchQuotientNode {
    throw new Error('Method not implemented.');
  }

  split(charIndex: number): [SearchQuotientNode, SearchQuotientNode] {
    throw new Error('Method not implemented.');
  }
}
```
```diff
@@ -256,12 +256,8 @@ describe('ContextState', () => {
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
       // empty transform
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);
-
-      // if(!newContextMatch.final.tokenization.alignment.canAlign) {
-      //   assert.fail("context alignment failed");
-      // }
-      // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
-      // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
+      assert.isTrue(state.tokenization.tail.searchModule instanceof SearchQuotientSpur);
+      assert.deepEqual((state.tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
     });

     it("properly matches and aligns when whitespace before final empty token is extended", function() {
```
```diff
@@ -286,8 +282,10 @@ describe('ContextState', () => {
       // Two whitespaces, one of which is new!
       const preTail = state.tokenization.tokens[state.tokenization.tokens.length - 2];
       assert.equal(preTail.searchModule.inputCount, 2);
-      assert.deepEqual((preTail.searchModule as SearchQuotientSpur).lastInput, [{sample: transform, p: 1}]);
+      assert.deepEqual((preTail.searchModule.parents[0] as SearchQuotientSpur).lastInput, [{sample: transform, p: 1}]);
       assert.equal(state.tokenization.tail.searchModule.inputCount, 1);
+      assert.isTrue(state.tokenization.tail.searchModule instanceof SearchQuotientSpur);
+      assert.deepEqual((state.tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
     });

     it("properly matches and aligns when a 'wordbreak' is removed via backspace", function() {
```
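The switch to `preTail.searchModule.parents[0]` in the hunk above reflects the new layer introduced by this PR: the token's search module is now expected to be a quotient node whose member paths are exposed through the `parents` getter defined in the new file. A sketch of the access pattern, with hypothetical declarations standing in for the test's fixtures:

```ts
declare const preTail: { searchModule: SearchQuotientCluster };

// The spur that directly recorded the most recent input is one of the
// cluster's member paths, reachable via the `parents` getter.
const spur = preTail.searchModule.parents[0] as SearchQuotientSpur;
spur.lastInput; // e.g. [{ sample: transform, p: 1 }]
```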
```diff
@@ -304,12 +302,6 @@ describe('ContextState', () => {
       let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
       assert.isOk(newContextMatch?.final);
       assert.deepEqual(newContextMatch?.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
-
-      // if(!newContextMatch.final.tokenization.alignment.canAlign) {
-      //   assert.fail("context alignment failed");
-      // }
-      // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
-      // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, -2);
     });

     it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
```
```diff
@@ -332,12 +324,6 @@ describe('ContextState', () => {
       let state = newContextMatch.final;
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);
-
-      // if(!newContextMatch.final.tokenization.alignment.canAlign) {
-      //   assert.fail("context alignment failed");
-      // }
-      // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
-      // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 1);
     })

     // Needs improved context-state management (due to 2x tokens)
```
```diff
@@ -398,12 +384,6 @@ describe('ContextState', () => {
       assert.equal(
         state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1
       );
-
-      // if(!newContextMatch.final.tokenization.alignment.canAlign) {
-      //   assert.fail("context alignment failed");
-      // }
-      // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
-      // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
     });

     it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() {
```
```diff
@@ -427,15 +407,9 @@ describe('ContextState', () => {
       let state = newContextMatch.final;
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
       assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);
-
-      // if(!newContextMatch.final.tokenization.alignment.canAlign) {
-      //   assert.fail("context alignment failed");
-      // }
-      // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
-      // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
     });

-    it('handles case where tail token is split into three rather than two', function() {
+    it.skip('handles case where tail token is split into three rather than two', function() {
       let baseContext = models.tokenize(defaultBreaker, {
         left: "text'", startOfBuffer: true, endOfBuffer: true
       });
```

> **Contributor:** I guess the skipped tests (in this and the other files) will be re-enabled in one of the following PRs?
```diff
@@ -561,7 +561,7 @@ describe('ContextTokenization', function() {
     }
   });

-  it('handles case that triggers a token merge: can+\'+t', () => {
+  it.skip('handles case that triggers a token merge: can+\'+t', () => {
     const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\''];
     const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));
```

> **Contributor (Author):** TODO: double-check - is this still valid? I thought I reworked things to be supported at this point; this change may be a holdover from a previous implementation strategy.
```diff
@@ -625,7 +625,7 @@ describe('ContextTokenization', function() {
     });
   });

-  it('handles case that triggers a token split: can\' +. => can, \', .', () => {
+  it.skip('handles case that triggers a token split: can\' +. => can, \', .', () => {
     const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\''];
     const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));
```

> Shouldn't this be ?