@@ -76,8 +76,7 @@ export class SearchCluster implements SearchSpace {
       this.selectionQueue.enqueueAll(inboundPaths);
     } else {
       const model = arg2 as LexicalModel;
-      const rootNode = new SearchNode(model.traverseFromRoot(), this.spaceId, (s) => model.toKey(s));
-      const rootPath = new SearchPath(rootNode);
+      const rootPath = new SearchPath(model);
       this.selectionQueue.enqueue(rootPath);
     }
 
@@ -168,4 +167,192 @@ export class SearchCluster implements SearchSpace {
   public stopTrackingResults() {
     delete this.completedPaths;
   }
+
+  public get parents(): ReadonlyArray<SearchPath> {
+    return this.selectionQueue.toArray();
+  }
+
+  public get codepointLength(): number {
+    // Guard against an empty queue; the optional chain belongs on the element,
+    // not on the getter (which always returns an array).
+    return this.parents[0]?.codepointLength ?? 0;
+  }
+
+  // public merge(space: SearchSpace): SearchSpace {
+  //   // just... iterate through entries to construct an extended version of THIS
+  //   // search-space.
+  //   // - though... we aren't actually set up to... DO that, are we?
+  //   // - ... .inputs? That might actually work!
+  //   // - issue: merging previously split inputs / input paths. How to identify those?
+  //   //   - ** wait: they share the same transform ID on both sides! **
+  //   // - alternative thought: could demarcate a source-path spaceID on split-off paths?
+  //   //   - or... maybe just a 'split ID'?
+  //   // - so... that's enough, right? For our purposes?
+  //   //   - but split + split again are not technically impossible...
+  //   //   - and even then, isn't that actually overcomplicating things?
+  //   //
+  //   // Needs a new spaceID, of course - at each appended step, to be clear.
+  //   //
+  //   // spaceID => SearchSpace, it would seem. Just have the token use the ID
+  //   // from SearchSpace / SearchEdge... wait. SearchEdges with the same ID... that's
+  //   // only really available from SearchSpace. May need to brainstorm that a bit.
+  //   // - current design WAS to do the combination on the State level.
+  //   // ... what if SearchSpace re-maps the search-path stateIDs to a combined stateID
+  //   // that it emits? Then we don't need to worry about micromanaging search-path
+  //   // IDs, given how they'll be constructed.
+  // }
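+
+  // A minimal sketch of the idea above - replaying the other space's inputs
+  // onto this one via addInput. Hypothetical and unverified: it does not yet
+  // handle re-merging previously split paths or remapping spaceIDs.
+  //
+  //   public merge(space: SearchCluster): SearchCluster {
+  //     let merged: SearchCluster = this;
+  //     for(const path of space.parents) {
+  //       // Replay one edge's input distribution onto the accumulated space.
+  //       merged = new SearchCluster([merged.addInput([...path.inputs], path.bestProbInEdge)]);
+  //     }
+  //     return merged;
+  //   }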
+
+  public split(charIndex: number, model: LexicalModel): [SearchSpace, SearchSpace][] {
+    return this._split(charIndex, model, new Map());
+  }
+
+  // splitCache:
+  // - key: id of the original search path being split
+  // - value's index: the number of preserved codepoints
+  // - value's instance at that index: the head/tail spaces spun off by the split
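+  //
+  // Illustrative example (hypothetical IDs): after splitting the path with
+  // spaceId 42 while preserving 3 codepoints, splitCache.get(42)[3] holds that
+  // { head, tail } pair, so a later split of the same path at the same point
+  // reuses it instead of rebuilding.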
+  private _split(
+    charIndex: number,
+    model: LexicalModel,
+    splitCache: Map<number, { head: SearchCluster, tail: SearchCluster }[]>
+  ): [SearchCluster, SearchCluster][] {
+    if(this.codepointLength == charIndex) {
+      console.log('split: boundary at tail; reusing space');
+      // If we're splitting at the tail end of an existing space, just re-use
+      // the space and pass along an empty one for the end.
+      return [[this, new SearchCluster(model)]];
+    }
+
+    // Ensure common split-ancestors still resolve to the same entity.
+    const componentPaths = this.selectionQueue.toArray();
+    let baseResultSet: [SearchCluster, SearchCluster][] = [];
+
+    const deduplicateSplitResults = (results: [SearchCluster, SearchCluster][]) => {
+      // Re-merge paths that converge to the same point, grouping by head space.
+      const duplicateMap: Map<number, [SearchCluster, SearchCluster][]> = new Map();
+      results.forEach(result => {
+        const headSpaceId = result[0].spaceId;
+        const arr: [SearchCluster, SearchCluster][] = duplicateMap.get(headSpaceId) ?? [];
+        arr.push(result);
+        duplicateMap.set(headSpaceId, arr);
+      });
+
+      const finalResults: [SearchCluster, SearchCluster][] = [];
+      for(const splits of duplicateMap.values()) {
+        const headSpace = splits[0][0];
+
+        // const uniqueTailSpaces = [...splits.reduce((set, curr) => {
+        //   if(!set.has(curr[1].spaceId)) {
+        //     set.set(curr[1].spaceId, curr[1]);
+        //   } else {
+        //     console.log('z');
+        //   }
+
+        //   return set;
+        // }, new Map<number, SearchSpace>()).values()];
+
+        // Merge all tails that share this head into a single tail space.
+        const paths = splits.flatMap(split => split[1].selectionQueue.toArray());
+        const tailSpace = new SearchCluster(paths);
+        // const resultPaths: [SearchSpace, SearchSpace][] = uniqueTailSpaces.map((tailSpace) => ([headSpace, tailSpace]));
+
+        // resultPaths.forEach(entry => finalResults.push(entry));
+        finalResults.push([headSpace, tailSpace]);
+      }
+
+      return finalResults;
+    };
+
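+    // Worked example (hypothetical spaceIds): given tuples [[A, t1], [A, t2], [B, t3]],
+    // the two A-headed tuples collapse into [A, tail(t1 + t2)], yielding one
+    // tuple per distinct head space.
+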
+    // Partition the paths: if the split point falls before a path's edge
+    // begins, the split happens in a parent space; otherwise it lands within
+    // this one.
+    const pathFiltering = componentPaths.reduce((filtering, path) => {
+      if(path.codepointLength - path.edgeLength > charIndex) {
+        filtering.inParent.push(path);
+      } else {
+        filtering.inCurrent.push(path);
+      }
+
+      return filtering;
+    }, { inParent: [] as SearchPath[], inCurrent: [] as SearchPath[] });
+
+    if(pathFiltering.inParent.length > 0) {
+      const parentResults = pathFiltering.inParent.flatMap((path) => {
+        console.log(`split in parent - ${path.bestExample.text}`);
+        // TODO: resolve!
+        const results = (path.parents[0] as SearchCluster)._split(charIndex, model, splitCache);
+
+        // Re-append this path's inputs onto each parent split's tail space.
+        return results.map((result) => {
+          const tailSpace = new SearchCluster([result[1].addInput([...path.inputs], path.bestProbInEdge)]);
+          result[1] = tailSpace;
+          return result;
+        });
+      });
+
+      baseResultSet = parentResults;
+    }
+
+    // Re: space IDs - we can't reuse data for anything we're reconstructing
+    // after the split point. Original space IDs on the left-hand side may
+    // remain unaltered, but the right-hand side needs to be rebuilt from
+    // scratch, in new SearchPaths / SearchSpaces.
+    //
+    // We can optimize how many new spaces/paths we create for the right-hand
+    // side, though: paths that each start the same count in, at the same
+    // input-offset position, should be safe to amalgamate.
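+    //
+    // For instance (hypothetical), two right-hand paths that both begin four
+    // codepoints into the token at the same input offset can share one
+    // reconstructed tail space instead of spawning two.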
+    const pathResults: [SearchCluster, SearchCluster][] = pathFiltering.inCurrent.map((path) => {
+      const parentSpace = path.parents[0] ?? new SearchCluster(model);
+      const pathStartIndex = path.codepointLength - path.edgeLength;
+      if(pathStartIndex == charIndex) {
+        console.log(`split at path start - ${path.bestExample.text}`);
+        // yay, great case! Splits cleanly on the boundary BEFORE this path, at
+        // its start.
+        //
+        // parentSpace is thus the END of the prior token.
+        // Start a new one with the current path.
+        // return [parentSpace, new SearchSpace(/* new spaceId */, path /* reconstructed, new space ID */)];
+        const newPath = new SearchCluster(model).addInput([...path.inputs], path.bestProbInEdge);
+        return [
+          parentSpace instanceof SearchPath ? new SearchCluster([parentSpace]) : parentSpace,
+          new SearchCluster([newPath])
+        ] as [SearchCluster, SearchCluster];
+      } else {
+        console.log(`split mid-path - ${path.bestExample.text}`);
+        // OK, so we need to actually split this path in twain.
+        const pathCharIndex = charIndex - pathStartIndex;
+        const results = path.split(pathCharIndex, model);
+        console.log(`pathId: ${path.spaceId} - ${splitCache.has(path.spaceId) ? 'found' : 'not found'}`);
+
+        // Reuse cached head/tail spaces if this path was already split here.
+        const pathSplitCacheArray = splitCache.get(path.spaceId) ?? [];
+        splitCache.set(path.spaceId, pathSplitCacheArray);
+
+        const newHeadSpace = pathSplitCacheArray[pathCharIndex]?.head
+          ?? new SearchCluster([new SearchPath(parentSpace, [...results[0].inputs], path.bestProbInEdge)]);
+        const newTailSpace = pathSplitCacheArray[pathCharIndex]?.tail
+          ?? new SearchCluster([new SearchCluster(model).addInput([...results[1].inputs], path.bestProbInEdge)]);
+
+        pathSplitCacheArray[pathCharIndex] = {
+          head: newHeadSpace,
+          tail: newTailSpace
+        };
+        return [newHeadSpace, newTailSpace];
+      }
+    });
+
+    baseResultSet = pathResults.concat(baseResultSet);
+
+    // From pathResults:
+    // - LHS deduplication: if the same spaceIDs appear on the left-hand side,
+    //   they're the same space; we likely split at the same point.
+    // - RHS: check search depth + offset position
+    //   - order by input-set likelihood
+    //   - replace other path variants with that
+    //
+    // Finally, deduplicate the tuples as much as possible.
+    // ... wait. Why do we have multiplicity in the paths? We need to be able to
+    // reduce things down to just 1 + 1 split token, not multiple in each position.
+    //
+    // ... first stop: we could just... take the most likely case and ignore the others?
+    // ... in which case, why evaluate ALL paths?
+    // - b/c LHS matches could show up multiple times?
+    //
+    // Can we mitigate these cases with improved output from the wordbreaker(s) - say,
+    // about "ambiguous wordbreak" scenarios?
+
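+    // One concrete form of the "take the most likely case" idea, assuming a
+    // hypothetical per-space probability accessor `bestProb` (not part of the
+    // current API):
+    //
+    //   const best = baseResultSet.reduce((a, b) =>
+    //     (a[1].bestProb ?? 0) >= (b[1].bestProb ?? 0) ? a : b);
+    //   return [best];
+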
+    console.log(`result count: ${baseResultSet.length}; results ${JSON.stringify(baseResultSet.map(r => ([r[0].bestExample.text, r[1].bestExample.text])))}`);
+    return deduplicateSplitResults(baseResultSet);
+  }
 }