Skip to content

Commit de3d86c

Browse files
committed
change(web): rework suggestion-alignment helper to use tokenization directly
1 parent 4b5a244 commit de3d86c

8 files changed

Lines changed: 33 additions & 38 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ export class ContextState {
268268
const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
269269
state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
270270
state.appliedInput = transformDistribution?.[0].sample;
271-
transition.finalize(state, transformDistribution, resultTokenization.taillessTrueKeystroke);
271+
transition.finalize(state, transformDistribution);
272272
transition.revertableTransitionId = appliedSuggestionTransitionId;
273273
return transition;
274274
}

web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ export class ContextTokenization {
118118
* If the final token is new due to a newly-introduced wordboundary traversed
119119
* by the keystroke, this will generally be set to an empty transform that
120120
* 'finalizes' the previous tail token.
121+
*
122+
* (Refer to #12494 for an example case.)
121123
*/
122124
readonly taillessTrueKeystroke: Transform;
123125

web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,6 @@ export class ContextTransition {
4747
// The transform ID in play.
4848
private _transitionId?: number;
4949

50-
/**
51-
* Indicates the portion of the incoming keystroke data, if any, that applies to
52-
* tokens before the last pre-caret token and thus should not be replaced by predictions
53-
* based upon `state`. If the provided context state + the incoming transform do not
54-
* adequately match the current context, the match attempt will fail with a `null` result.
55-
*
56-
* Should generally be non-null if the token before the caret did not previously exist.
57-
*
58-
* The result may be null if it does not match the prior context state or if bookkeeping
59-
* based upon it is problematic - say, if wordbreaking effects shift due to new input,
60-
* causing a mismatch with the prior state's tokenization.
61-
* (Refer to #12494 for an example case.)
62-
*/
63-
preservationTransform?: Transform;
64-
6550
/**
6651
* When set, indicates that the text insertion point has returned to the endpoint of a
6752
* token last edited by application of a Suggestion. This is not set immediately after
@@ -133,13 +118,12 @@ export class ContextTransition {
133118
* @param preservationTransform Portions of the most likely input that do not contribute to the final token
134119
* in the final context's tokenization.
135120
*/
136-
finalize(state: ContextState, inputDistribution: Distribution<Transform>, preservationTransform?: Transform) {
121+
finalize(state: ContextState, inputDistribution: Distribution<Transform>) {
137122
this._final = state;
138123
this.inputDistribution = inputDistribution;
139124
// Long-term, this should never be null... but we need to allow it at this point
140125
// in the refactoring process.
141126
this._transitionId = inputDistribution?.find((entry) => entry.sample.id !== undefined)?.sample.id;
142-
this.preservationTransform = preservationTransform;
143127
}
144128

145129
/**

web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre
55

66
import TransformUtils from './transformUtils.js';
77
import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js';
8+
import { ContextTokenization } from './correction/context-tokenization.js';
89
import { ContextTracker } from './correction/context-tracker.js';
910
import { ContextState, determineContextSlideTransform } from './correction/context-state.js';
1011
import { ExecutionTimer } from './correction/execution-timer.js';
@@ -326,6 +327,7 @@ export function determineContextTransition(
326327
*/
327328
export function determineSuggestionAlignment(
328329
transition: ContextTransition,
330+
tokenization: ContextTokenization,
329331
lexicalModel: LexicalModel
330332
): {
331333
/**
@@ -338,7 +340,7 @@ export function determineSuggestionAlignment(
338340
*/
339341
deleteLeft: number
340342
} {
341-
const transitionEdits = transition.final.tokenization.transitionEdits;
343+
const transitionEdits = tokenization.transitionEdits;
342344
const context = transition.base.context;
343345
const postContext = transition.final.context;
344346
const inputTransform = transition.inputDistribution[0].sample;
@@ -349,13 +351,13 @@ export function determineSuggestionAlignment(
349351
const wordbreak = determineModelWordbreaker(lexicalModel);
350352

351353
// Is the token under construction newly-constructed / is there no pre-existing root?
352-
if(transition.preservationTransform && inputTransformMap?.has(1)) {
354+
if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) {
353355
return {
354356
// If the new token is due to whitespace or due to a different input type
355357
// that would likely imply a tokenization boundary, infer 'new word' mode.
356358
// Apply any part of the context change that is not considered to be up
357359
// for correction.
358-
predictionContext: models.applyTransform(transition.preservationTransform, context),
360+
predictionContext: models.applyTransform(tokenization.taillessTrueKeystroke, context),
359361
// As the word/token being corrected/predicted didn't originally exist,
360362
// there's no part of it to 'replace'. (Suggestions are applied to the
361363
// pre-transform state.)
@@ -379,7 +381,7 @@ export function determineSuggestionAlignment(
379381

380382
// Did the wordbreaker (or similar) append a blank token before the caret? If so,
381383
// preserve that by preventing corrections from triggering left-deletion.
382-
if(transition.final.tokenization.tail.isEmptyToken) {
384+
if(tokenization.tail.isEmptyToken) {
383385
deleteLeft = 0;
384386
}
385387

@@ -453,10 +455,6 @@ export async function correctAndEnumerate(
453455
}
454456
}
455457

456-
// No matter the prediction, once we know the root of the prediction, we'll always 'replace' the
457-
// same amount of text. We can handle this before the big 'prediction root' loop.
458-
const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, lexicalModel);
459-
460458
// TODO: Should we filter backspaces & whitespaces out of the transform distribution?
461459
// Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue.
462460

@@ -471,6 +469,13 @@ export async function correctAndEnumerate(
471469
// when no fat-finger data is available.
472470
if(!searchSpaces.find(s => s.correctionsEnabled)) {
473471
const wordbreak = determineModelWordbreaker(lexicalModel);
472+
// The one true tokenization: no corrections permitted.
473+
const tokenization = transition.final.tokenization;
474+
475+
// No matter the prediction, once we know the root of the prediction, we'll always 'replace' the
476+
// same amount of text. We can handle this before the big 'prediction root' loop.
477+
const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel);
478+
474479
const predictionRoot = {
475480
sample: {
476481
insert: wordbreak(transition.final.context),
@@ -481,7 +486,7 @@ export async function correctAndEnumerate(
481486
};
482487

483488
const predictions = predictFromCorrections(lexicalModel, [predictionRoot], predictionContext);
484-
predictions.forEach((entry) => entry.preservationTransform = transition.preservationTransform);
489+
predictions.forEach((entry) => entry.preservationTransform = tokenization.taillessTrueKeystroke);
485490

486491
// Only one 'correction' / prediction root is allowed - the actual text.
487492
return {
@@ -501,6 +506,11 @@ export async function correctAndEnumerate(
501506
const searchSpace = searchSpaces.find(s => s.spaceId == match.spaceId);
502507
const tokenization = tokenizations.find(t => t.spaceId == match.spaceId);
503508

509+
// No matter the prediction, once we know the root of the prediction, we'll
510+
// always 'replace' the same amount of text. We can handle this before the
511+
// big 'prediction root' loop.
512+
const { predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel);
513+
504514
// If our 'match' results in fully deleting the new token, reject it and try again.
505515
if(match.matchSequence.length == 0 && match.inputSequence.length != 0) {
506516
continue;

web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ describe('ContextState', () => {
248248
assert.isNotNull(newContextMatch?.final);
249249
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
250250
// We want to preserve the added whitespace when predicting a token that follows after it.
251-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
251+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
252252

253253
// The 'wordbreak' transform
254254
let state = newContextMatch?.final;
@@ -279,7 +279,7 @@ describe('ContextState', () => {
279279
assert.isNotNull(newContextMatch?.final);
280280
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
281281
// We want to preserve the added whitespace when predicting a token that follows after it.
282-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
282+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
283283

284284
// The 'wordbreak' transform
285285
let state = newContextMatch?.final;
@@ -326,7 +326,7 @@ describe('ContextState', () => {
326326
let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform));
327327
assert.isNotNull(newContextMatch?.final);
328328
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
329-
assert.deepEqual(newContextMatch.preservationTransform, { insert: '', deleteLeft: 0 });
329+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: '', deleteLeft: 0 });
330330

331331
// The 'wordbreak' transform
332332
let state = newContextMatch.final;
@@ -359,7 +359,7 @@ describe('ContextState', () => {
359359
assert.isNotNull(newContextMatch?.final);
360360
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
361361
// We want to preserve the added whitespace when predicting a token that follows after it.
362-
assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 });
362+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 });
363363

364364
// The 'wordbreak' transform
365365
let state = newContextMatch.final;
@@ -390,7 +390,7 @@ describe('ContextState', () => {
390390
assert.isNotNull(newContextMatch?.final);
391391
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
392392
// We want to preserve all text preceding the new token when applying a suggestion.
393-
assert.deepEqual(newContextMatch.preservationTransform, { insert: 'd ', deleteLeft: 0});
393+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'd ', deleteLeft: 0});
394394

395395
// The 'wordbreak' transform
396396
let state = newContextMatch.final;
@@ -421,7 +421,7 @@ describe('ContextState', () => {
421421
assert.isNotNull(newContextMatch?.final);
422422
assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens);
423423
// We want to preserve all text preceding the new token when applying a suggestion.
424-
assert.deepEqual(newContextMatch.preservationTransform, { insert: 'tor ', deleteLeft: 0 });
424+
assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'tor ', deleteLeft: 0 });
425425

426426
// The 'wordbreak' transform
427427
let state = newContextMatch.final;

web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ describe('ContextTransition', () => {
5656
assert.equal(transition.transitionId, 1);
5757
assert.isNotOk(transition.final);
5858
assert.isNotOk(transition.inputDistribution);
59-
assert.isNotOk(transition.preservationTransform);
6059
});
6160

6261
it('deep-copies when given a previous ContextState instance (no `final`)', () => {

web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ describe('determineSuggestionAlignment', () => {
4848
transition.finalize(transition.base, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]);
4949

5050
// transition, model
51-
const results = determineSuggestionAlignment(transition, plainCasedModel);
51+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
5252

5353
assert.deepEqual(results.predictionContext, context);
5454
assert.equal(results.deleteLeft, "techn".length);
@@ -65,7 +65,7 @@ describe('determineSuggestionAlignment', () => {
6565
const transition = baseState.analyzeTransition(context, [{sample: { insert: '', deleteLeft: 1 }, p: 1}])
6666

6767
// transition, model
68-
const results = determineSuggestionAlignment(transition, plainCasedModel);
68+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
6969

7070
assert.deepEqual(results.predictionContext, context);
7171
assert.equal(results.deleteLeft, "tech".length + 1 /* for the deleted whitespace */);
@@ -82,7 +82,7 @@ describe('determineSuggestionAlignment', () => {
8282
const transition = baseState.analyzeTransition(context, [{sample: { insert: 'n', deleteLeft: 1 }, p: 1}])
8383

8484
// transition, model
85-
const results = determineSuggestionAlignment(transition, plainCasedModel);
85+
const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel);
8686

8787
assert.deepEqual(results.predictionContext, context);
8888
assert.equal(results.deleteLeft, "techn".length + 1 /* for the deleted whitespace */);

web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ describe('determineContextTransition', () => {
108108
assert.equal(transition.final.context.left, targetContext.left);
109109
assert.equal(transition.final.context.right ?? "", targetContext.right ?? "");
110110
assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution);
111-
assert.isNotOk(transition.preservationTransform);
111+
assert.isNotOk(transition.final.tokenization.taillessTrueKeystroke);
112112
assert.equal(transition.transitionId, 1);
113113
} finally {
114114
warningEmitterSpy.restore();

0 commit comments

Comments (0)