Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types';

import { ClassicalDistanceCalculation } from './classical-calculation.js';
import { ExecutionTimer, STANDARD_TIME_BETWEEN_DEFERS } from './execution-timer.js';
import { PathResult, SearchQuotientNode } from './search-quotient-node.js';
import { SearchQuotientNode } from './search-quotient-node.js';
import { subsetByChar, subsetByInterval, mergeSubset, TransformSubset } from '../transform-subsets.js';
import TransformUtils from '../transformUtils.js';

Expand Down Expand Up @@ -573,9 +573,12 @@ export class SearchNode {

export class SearchResult {
readonly node: SearchNode;
// Supports SearchPath -> SearchSpace remapping.
readonly spaceId: number;

constructor(node: SearchNode) {
constructor(node: SearchNode, spaceId?: number) {
this.node = node;
this.spaceId = spaceId ?? node.spaceId;
}

get inputSequence(): ProbabilityMass<Transform>[] {
Expand Down Expand Up @@ -622,10 +625,6 @@ export class SearchResult {
get finalTraversal(): LexiconTraversal {
return this.node.currentTraversal;
}

get spaceId(): number {
return this.node.spaceId;
}
}

/**
Expand Down Expand Up @@ -655,15 +654,15 @@ export async function *getBestMatches(searchModules: SearchQuotientNode[], timer

// Stage 2: the fun part; actually searching!
do {
const entry: SearchResult = timer.time(() => {
if((priorResultsQueue.peek()?.totalCost ?? Number.POSITIVE_INFINITY) < spaceQueue.peek().currentCost) {
const entry = timer.time(() => {
if((priorResultsQueue.peek()?.totalCost ?? Number.POSITIVE_INFINITY) <= spaceQueue.peek().currentCost) {
const result = priorResultsQueue.dequeue();
currentReturns[result.node.resultKey] = result.node;
return result;
}

let lowestCostSource = spaceQueue.dequeue();
let newResult: PathResult = lowestCostSource.handleNextNode();
const newResult = lowestCostSource.handleNextNode();
spaceQueue.enqueue(lowestCostSource);

if(newResult.type == 'none') {
Expand All @@ -688,7 +687,7 @@ export async function *getBestMatches(searchModules: SearchQuotientNode[], timer
if((currentReturns[node.resultKey]?.currentCost ?? Number.MAX_VALUE) > node.currentCost) {
currentReturns[node.resultKey] = node;
// Do not track yielded time.
return new SearchResult(node);
return new SearchResult(node, newResult.spaceId);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
/*
* Keyman is copyright (C) SIL Global. MIT License.
*
* Created by jahorton on 2025-10-20
*
* This file defines the predictive-text engine's SearchSpace class, which is used to
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be ?

Suggested change
* This file defines the predictive-text engine's SearchSpace class, which is used to
* This file defines the predictive-text engine's SearchQuotientCluster class, which is used to

* manage the search-space(s) for text corrections within the engine.
*/

import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we please directly use QueueComparator?

Suggested change
import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils';
import { QueueComparator, PriorityQueue } from '@keymanapp/web-utils';

import { LexicalModelTypes } from '@keymanapp/common-types';

import { SearchNode, SearchResult } from './distance-modeler.js';
import { generateSpaceSeed, InputSegment, PathResult, SearchQuotientNode } from './search-quotient-node.js';

const PATH_QUEUE_COMPARATOR: Comparator<SearchQuotientNode> = (a, b) => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const PATH_QUEUE_COMPARATOR: Comparator<SearchQuotientNode> = (a, b) => {
const PATH_QUEUE_COMPARATOR: QueueComparator<SearchQuotientNode> = (a, b) => {

return a.currentCost - b.currentCost;
}

// The set of search spaces corresponding to the same 'context' for search.
// Whenever a wordbreak boundary is crossed, a new instance should be made.
export class SearchQuotientCluster implements SearchQuotientNode {
// While most functions can be done directly from SearchSpace, merging and splitting will need access
// to SearchPath-specific members. It's also cleaner to not allow nested SearchClusters while we
// haven't worked out support for such a scenario.
private selectionQueue: PriorityQueue<SearchQuotientNode> = new PriorityQueue(PATH_QUEUE_COMPARATOR);
readonly spaceId: number;

// We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time
// is cheaper than iteratively building a priority queue.
/**
* This tracks all paths that have reached the end of a viable input-matching path - even
* those of lower cost that produce the same correction as other paths.
*
* When new input is received, its entries are then used to append edges to the path in order
* to find potential paths to reach a new viable end.
*/
private completedPaths?: SearchNode[] = [];

/**
* Acts as a Map that prevents duplicating a correction-search path if reached
* more than once.
*/
protected get processedEdgeSet(): {[pathKey: string]: boolean} {
return this._processedEdgeSet;
}

private _processedEdgeSet?: {[pathKey: string]: boolean} = {};

/**
* Provides a heuristic for the base cost at each depth if the best
* individual input were taken at that level.
*/
readonly lowestPossibleSingleCost: number;

/**
* Constructs a fresh SearchSpace instance for used in predictive-text correction
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Constructs a fresh SearchSpace instance for used in predictive-text correction
* Constructs a fresh SearchSpace instance for use in predictive-text correction

* and suggestion searches.
* @param baseSpaceId
* @param model
Comment on lines +59 to +60
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* @param baseSpaceId
* @param model
* @param inboundPaths

*/
constructor(inboundPaths: SearchQuotientNode[]) {
if(inboundPaths.length == 0) {
throw new Error("SearchCluster requires an array with at least one SearchPath");
}

let lowestPossibleSingleCost = Number.POSITIVE_INFINITY;
const firstPath = inboundPaths[0];
const inputCount = firstPath.inputCount;
const codepointLength = firstPath.codepointLength;
const sourceRangeKey = firstPath.sourceRangeKey;

for(let path of inboundPaths) {
if(path.inputCount != inputCount || path.codepointLength != codepointLength) {
throw new Error(`SearchPath does not share same properties as others in the cluster: inputCount ${path.inputCount} vs ${inputCount}, codepointLength ${path.codepointLength} vs ${codepointLength}`);
}

// If there's a source-range key mismatch - via mismatch in count or in actual ID, we have an error.
if(path.sourceRangeKey != sourceRangeKey) {
throw new Error(`SearchPath does not share the same source identifiers as others in the cluster`);
}

lowestPossibleSingleCost = Math.min(lowestPossibleSingleCost, path.lowestPossibleSingleCost);
}

this.spaceId = generateSpaceSeed();

this.lowestPossibleSingleCost = lowestPossibleSingleCost;
this.completedPaths = inboundPaths.flatMap(p => p.previousResults).map(r => r.node);
this.selectionQueue.enqueueAll(inboundPaths);

return;
}

public get inputCount(): number {
return this.selectionQueue.peek()?.inputCount ?? 0;
}

public get bestExample(): {text: string, p: number} {
const bestPrefixes = this.selectionQueue.toArray().map(p => p.bestExample);
return bestPrefixes.reduce((max, curr) => max.p < curr.p ? curr : max);
}

public get parents(): SearchQuotientNode[] {
return this.selectionQueue.toArray().slice();
}

increaseMaxEditDistance() {
// By extracting the entries from the priority queue and increasing distance outside of it as a batch job,
// we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue.
const entries = this.selectionQueue.toArray();

entries.forEach((path) => path.increaseMaxEditDistance());

// Since we just modified the stored instances, and the costs may have shifted, we need to re-heapify.
this.selectionQueue = new PriorityQueue<SearchQuotientNode>(PATH_QUEUE_COMPARATOR, entries.slice());
}

/**
* When true, this indicates that the currently-represented portion of context
* has fat-finger data available, which itself indicates that the user has
* corrections enabled.
*/
get correctionsEnabled(): boolean {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
get correctionsEnabled(): boolean {
get correctionsEnabled(): boolean {

const paths = this.selectionQueue.toArray();
// When corrections are disabled, the Web engine will only provide individual Transforms
// for an input, not a distribution. No distributions means we shouldn't do corrections.
return !!paths.find(p => p.correctionsEnabled);
}

public get currentCost(): number {
return this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY;
}

/**
* Retrieves the lowest-cost / lowest-distance edge from the selection queue,
* checks its validity as a correction to the input text, and reports on what
* sort of result the edge's destination node represents.
* @returns
*/
public handleNextNode(): PathResult {
const bestPath = this.selectionQueue.dequeue();
const currentResult = bestPath.handleNextNode();
this.selectionQueue.enqueue(bestPath);

if(currentResult.type == 'complete') {
this.completedPaths?.push(currentResult.finalNode);
currentResult.spaceId = this.spaceId;
}

return currentResult;
}

public get previousResults(): SearchResult[] {
return this.completedPaths?.map((n => new SearchResult(n, this.spaceId))) ?? [];
}

get model(): LexicalModelTypes.LexicalModel {
return this.parents[0].model;
}

get codepointLength(): number {
return this.parents[0].codepointLength;
}

get inputSegments(): InputSegment[] {
return this.parents[0].inputSegments;
};

/**
* Gets a compact string-based representation of `inputRange` that
* maps compatible token source ranges to each other.
*/
get sourceRangeKey(): string {
return this.parents[0].sourceRangeKey;
}

merge(space: SearchQuotientNode): SearchQuotientNode {
throw new Error('Method not implemented.');
}

split(charIndex: number): [SearchQuotientNode, SearchQuotientNode] {
throw new Error('Method not implemented.');
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ export * from './correction/context-tokenization.js';
export { ContextTracker } from './correction/context-tracker.js';
export { ContextTransition } from './correction/context-transition.js';
export * from './correction/distance-modeler.js';
export * from './correction/search-quotient-node.js';
export * from './correction/search-quotient-cluster.js';
export * from './correction/search-quotient-spur.js';
export * from './correction/search-quotient-node.js';
export * from './correction/legacy-quotient-root.js';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,8 @@ describe('ContextState', () => {
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
// empty transform
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);

// if(!newContextMatch.final.tokenization.alignment.canAlign) {
// assert.fail("context alignment failed");
// }
// assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
// assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
assert.isTrue(state.tokenization.tail.searchModule instanceof SearchQuotientSpur);
assert.deepEqual((state.tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
});

it("properly matches and aligns when whitespace before final empty token is extended", function() {
Expand All @@ -286,8 +282,10 @@ describe('ContextState', () => {
// Two whitespaces, one of which is new!
const preTail = state.tokenization.tokens[state.tokenization.tokens.length - 2];
assert.equal(preTail.searchModule.inputCount, 2);
assert.deepEqual((preTail.searchModule as SearchQuotientSpur).lastInput, [{sample: transform, p: 1}]);
assert.deepEqual((preTail.searchModule.parents[0] as SearchQuotientSpur).lastInput, [{sample: transform, p: 1}]);
assert.equal(state.tokenization.tail.searchModule.inputCount, 1);
assert.isTrue(state.tokenization.tail.searchModule instanceof SearchQuotientSpur);
assert.deepEqual((state.tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
});

it("properly matches and aligns when a 'wordbreak' is removed via backspace", function() {
Expand All @@ -304,12 +302,6 @@ describe('ContextState', () => {
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
assert.isOk(newContextMatch?.final);
assert.deepEqual(newContextMatch?.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);

// if(!newContextMatch.final.tokenization.alignment.canAlign) {
// assert.fail("context alignment failed");
// }
// assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
// assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, -2);
});

it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
Expand All @@ -332,12 +324,6 @@ describe('ContextState', () => {
let state = newContextMatch.final;
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);

// if(!newContextMatch.final.tokenization.alignment.canAlign) {
// assert.fail("context alignment failed");
// }
// assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
// assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 1);
})

// Needs improved context-state management (due to 2x tokens)
Expand Down Expand Up @@ -398,12 +384,6 @@ describe('ContextState', () => {
assert.equal(
state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1
);

// if(!newContextMatch.final.tokenization.alignment.canAlign) {
// assert.fail("context alignment failed");
// }
// assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
// assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
});

it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() {
Expand All @@ -427,15 +407,9 @@ describe('ContextState', () => {
let state = newContextMatch.final;
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchModule.inputCount, 1);
assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchModule.inputCount, 1);

// if(!newContextMatch.final.tokenization.alignment.canAlign) {
// assert.fail("context alignment failed");
// }
// assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0);
// assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2);
});

it('handles case where tail token is split into three rather than two', function() {
it.skip('handles case where tail token is split into three rather than two', function() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the skipped tests (in this and the other files) will be re-enabled in one of the following PRs?

let baseContext = models.tokenize(defaultBreaker, {
left: "text'", startOfBuffer: true, endOfBuffer: true
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ describe('ContextTokenization', function() {
}
});

it('handles case that triggers a token merge: can+\'+t', () => {
it.skip('handles case that triggers a token merge: can+\'+t', () => {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: double-check - is this still valid? I thought I reworked things to be supported at this point; this change may be a holdover from a previous implementation strategy.

const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\''];
const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));

Expand Down Expand Up @@ -625,7 +625,7 @@ describe('ContextTokenization', function() {
});
});

it('handles case that triggers a token split: can\' +. => can, \', .', () => {
it.skip('handles case that triggers a token split: can\' +. => can, \', .', () => {
const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\''];
const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));

Expand Down
Loading