Skip to content

Commit de3d86c

Browse files
committed
change(web): rework suggestion-alignment helper to use tokenization directly
1 parent 4b5a244 commit de3d86c

8 files changed

Lines changed: 33 additions & 38 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ export class ContextState {
268268
const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
269269
state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
270270
state.appliedInput = transformDistribution?.[0].sample;
271-
transition.finalize(state, transformDistribution, resultTokenization.taillessTrueKeystroke);
271+
transition.finalize(state, transformDistribution);
272272
transition.revertableTransitionId = appliedSuggestionTransitionId;
273273
return transition;
274274
}

web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ export class ContextTokenization {
118118
* If the final token is new due to a newly-introduced wordboundary traversed
119119
* by the keystroke, this will generally be set to an empty transform that
120120
* 'finalizes' the previous tail token.
121+
*
122+
* (Refer to #12494 for an example case.)
121123
*/
122124
readonly taillessTrueKeystroke: Transform;
123125

web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,6 @@ export class ContextTransition {
4747
// The transform ID in play.
4848
private _transitionId?: number;
4949

50-
/**
51-
* Indicates the portion of the incoming keystroke data, if any, that applies to
52-
* tokens before the last pre-caret token and thus should not be replaced by predictions
53-
* based upon `state`. If the provided context state + the incoming transform do not
54-
* adequately match the current context, the match attempt will fail with a `null` result.
55-
*
56-
* Should generally be non-null if the token before the caret did not previously exist.
57-
*
58-
* The result may be null if it does not match the prior context state or if bookkeeping
59-
* based upon it is problematic - say, if wordbreaking effects shift due to new input,
60-
* causing a mismatch with the prior state's tokenization.
61-
* (Refer to #12494 for an example case.)
62-
*/
63-
preservationTransform?: Transform;
64-
6550
/**
6651
* When set, indicates that the text insertion point has returned to the endpoint of a
6752
* token last edited by application of a Suggestion. This is not set immediately after
@@ -133,13 +118,12 @@ export class ContextTransition {
133118
* @param preservationTransform Portions of the most likely input that do not contribute to the final token
134119
* in the final context's tokenization.
135120
*/
136-
finalize(state: ContextState, inputDistribution: Distribution<Transform>, preservationTransform?: Transform) {
121+
finalize(state: ContextState, inputDistribution: Distribution<Transform>) {
137122
this._final = state;
138123
this.inputDistribution = inputDistribution;
139124
// Long-term, this should never be null... but we need to allow it at this point
140125
// in the refactoring process.
141126
this._transitionId = inputDistribution?.find((entry) => entry.sample.id !== undefined)?.sample.id;
142-
this.preservationTransform = preservationTransform;
143127
}
144128

145129
/**

web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
55

66
import TransformUtils from './transformUtils.js';
77
import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
8+
import { ContextTokenization } from './correction/context-tokenization.js';
89
import { ContextTracker } from './correction/context-tracker.js';
910
import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
1011
import { ExecutionTimer } from './correction/execution-timer.js';
@@ -326,6 +327,7 @@ export function determineContextTransition(
326327
*/
327328
export function determineSuggestionAlignment(
328329
transition: ContextTransition,
330+
tokenization: ContextTokenization,
329331
lexicalModel: LexicalModel
330332
): {
331333
/**
@@ -338,7 +340,7 @@ export function determineSuggestionAlignment(
338340
*/
339341
deleteLeft: number
340342
} {
341-
const transitionEdits = transition.final.tokenization.transitionEdits;
343+
const transitionEdits = tokenization.transitionEdits;
342344
const context = transition.base.context;
343345
const postContext = transition.final.context;
344346
const inputTransform = transition.inputDistribution[0].sample;
@@ -349,13 +351,13 @@ export function determineSuggestionAlignment(
349351
const wordbreak = determineModelWordbreaker(lexicalModel);
350352

351353
// Is the token under construction newly-constructed / is there no pre-existing root?
352-
if(transition.preservationTransform && inputTransformMap?.has(1)) {
354+
if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) {
353355
return {
354356
// If the new token is due to whitespace or due to a different input type
355357
// that would likely imply a tokenization boundary, infer 'new word' mode.
356358
// Apply any part of the context change that is not considered to be up
357359
// for correction.
358-
predictionContext: models.applyTransform(transition.preservationTransform, context),
360+
predictionContext: models.applyTransform(tokenization.taillessTrueKeystroke, context),
359361
// As the word/token being corrected/predicted didn't originally exist,
360362
// there's no part of it to 'replace'. (Suggestions are applied to the
361363
// pre-transform state.)
@@ -379,7 +381,7 @@ export function determineSuggestionAlignment(
379381

380382
// Did the wordbreaker (or similar) append a blank token before the caret? If so,
381383
// preserve that by preventing corrections from triggering left-deletion.
382-
if(transition.final.tokenization.tail.isEmptyToken) {
384+
if(tokenization.tail.isEmptyToken) {
383385
deleteLeft = 0;
384386
}
385387

@@ -453,10 +455,6 @@ export async function correctAndEnumerate(
453455
}
454456
}
455457

456-
// No matter the prediction, once we know the root of the prediction, we'll always 'replace' the
457-
// same amount of text. We can handle this before the big 'prediction root' loop.
458-
const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, lexicalModel);
459-
460458
// TODO: Should we filter backspaces & whitespaces out of the transform distribution?
461459
// Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue.
462460

@@ -471,6 +469,13 @@ export async function correctAndEnumerate(
471469
// when no fat-finger data is available.
472470
if(!searchSpaces.find(s => s.correctionsEnabled)) {
473471
const wordbreak = determineModelWordbreaker(lexicalModel);
472+
// The one true tokenization: no corrections permitted.
473+
const tokenization = transition.final.tokenization;
474+
475+
// No matter the prediction, once we know the root of the prediction, we'll always 'replace' the
476+
// same amount of text. We can handle this before the big 'prediction root' loop.
477+
const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel);
478+
474479
const predictionRoot = {
475480
sample: {
476481
insert: wordbreak(transition.final.context),
@@ -481,7 +486,7 @@ export async function correctAndEnumerate(
481486
};
482487

483488
const predictions = predictFromCorrections(lexicalModel, [predictionRoot], predictionContext);
484-
predictions.forEach((entry) => entry.preservationTransform = transition.preservationTransform);
489+
predictions.forEach((entry) => entry.preservationTransform = tokenization.taillessTrueKeystroke);
485490

486491
// Only one 'correction' / prediction root is allowed - the actual text.
487492
return {
@@ -501,6 +506,11 @@ export async function correctAndEnumerate(
501506
const searchSpace = searchSpaces.find(s => s.spaceId == match.spaceId);
502507
const tokenization = tokenizations.find(t => t.spaceId == match.spaceId);
503508

509+
// No matter the prediction, once we know the root of the prediction, we'll
510+
// always 'replace' the same amount of text. We can handle this before the
511+
// big 'prediction root' loop.
512+
const { predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel);
513+
504514
// If our 'match' results in fully deleting the new token, reject it and try again.
505515
if(match.matchSequence.length == 0 && match.inputSequence.length != 0) {
506516
continue;

web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ describe('ContextState', () => {
248248
assert.isNotNull(newContextMatch?.final);
249249
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
250250
// We want to preserve the added whitespace when predicting a token that follows after it.
251-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
251+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
252252

253253
// The 'wordbreak' transform
254254
let state = newContextMatch?.final;
@@ -279,7 +279,7 @@ describe('ContextState', () => {
279279
assert.isNotNull(newContextMatch?.final);
280280
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
281281
// We want to preserve the added whitespace when predicting a token that follows after it.
282-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
282+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
283283

284284
// The 'wordbreak' transform
285285
let state = newContextMatch?.final;
@@ -326,7 +326,7 @@ describe('ContextState', () => {
326326
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
327327
assert.isNotNull(newContextMatch?.final);
328328
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
329-
assert.deepEqual(newContextMatch.preservationTransform, { insert: '', deleteLeft: 0 });
329+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: '', deleteLeft: 0 });
330330

331331
// The 'wordbreak' transform
332332
let state = newContextMatch.final;
@@ -359,7 +359,7 @@ describe('ContextState', () => {
359359
assert.isNotNull(newContextMatch?.final);
360360
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
361361
// We want to preserve the added whitespace when predicting a token that follows after it.
362-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
362+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
363363

364364
// The 'wordbreak' transform
365365
let state = newContextMatch.final;
@@ -390,7 +390,7 @@ describe('ContextState', () => {
390390
assert.isNotNull(newContextMatch?.final);
391391
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
392392
// We want to preserve all text preceding the new token when applying a suggestion.
393-
assert.deepEqual(newContextMatch.preservationTransform, { insert: 'd ', deleteLeft: 0});
393+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'd ', deleteLeft: 0});
394394

395395
// The 'wordbreak' transform
396396
let state = newContextMatch.final;
@@ -421,7 +421,7 @@ describe('ContextState', () => {
421421
assert.isNotNull(newContextMatch?.final);
422422
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
423423
// We want to preserve all text preceding the new token when applying a suggestion.
424-
assert.deepEqual(newContextMatch.preservationTransform, { insert: 'tor ', deleteLeft: 0 });
424+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'tor ', deleteLeft: 0 });
425425

426426
// The 'wordbreak' transform
427427
let state = newContextMatch.final;

web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ describe('ContextTransition', () => {
5656
assert.equal(transition.transitionId, 1);
5757
assert.isNotOk(transition.final);
5858
assert.isNotOk(transition.inputDistribution);
59-
assert.isNotOk(transition.preservationTransform);
6059
});
6160

6261
it('deep-copies when given a previous ContextState instance (no `final`)', () => {

web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ describe('determineSuggestionAlignment', () => {
4848
transition.finalize(transition.base, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
4949

5050
// transition, model
51-
const results = determineSuggestionAlignment(transition, plainCasedModel);
51+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
5252

5353
assert.deepEqual(results.predictionContext, context);
5454
assert.equal(results.deleteLeft, "techn".length);
@@ -65,7 +65,7 @@ describe('determineSuggestionAlignment', () => {
6565
const transition = baseState.analyzeTransition(context, [{sample: { insert: '', deleteLeft: 1 }, p: 1}])
6666

6767
// transition, model
68-
const results = determineSuggestionAlignment(transition, plainCasedModel);
68+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
6969

7070
assert.deepEqual(results.predictionContext, context);
7171
assert.equal(results.deleteLeft, "tech".length + 1 /* for the deleted whitespace */);
@@ -82,7 +82,7 @@ describe('determineSuggestionAlignment', () => {
8282
const transition = baseState.analyzeTransition(context, [{sample: { insert: 'n', deleteLeft: 1 }, p: 1}])
8383

8484
// transition, model
85-
const results = determineSuggestionAlignment(transition, plainCasedModel);
85+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
8686

8787
assert.deepEqual(results.predictionContext, context);
8888
assert.equal(results.deleteLeft, "techn".length + 1 /* for the deleted whitespace */);

web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ describe('determineContextTransition', () => {
108108
assert.equal(transition.final.context.left, targetContext.left);
109109
assert.equal(transition.final.context.right ?? "", targetContext.right ?? "");
110110
assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution);
111-
assert.isNotOk(transition.preservationTransform);
111+
assert.isNotOk(transition.final.tokenization.taillessTrueKeystroke);
112112
assert.equal(transition.transitionId, 1);
113113
} finally {
114114
warningEmitterSpy.restore();

0 commit comments

Comments (0)