Skip to content

Commit 00b3d93

Browse files
committed
fix(nemo): scope ASR tests and address review fixes
Keep the shared ASR pipeline suite focused on the public Nemo contract and move adapter-specific windowing, retranscription, cache-ownership, and disposal coverage into a dedicated Nemo pipeline test file.

Narrow the source diff by removing explanatory Nemo comments and reverting unrelated upstream-only tweaks, while also fixing the review findings around cursor snap-forward merging, tokenizer vocab-shape handling, empty timestamp validation, and cache borrow/release semantics for active inference.

Verification:
- `node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/models.test.js -t "nemo_conformer_tdt"`
- `node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --config jest.config.mjs --runInBand tests/pipelines.test.js -t "Nemo Conformer TDT pipeline adapter|Automatic Speech Recognition"`
1 parent f59ba06 commit 00b3d93

11 files changed

Lines changed: 1020 additions & 876 deletions

packages/transformers/src/models/modeling_utils.js

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -877,9 +877,8 @@ export class PreTrainedModel extends Callable {
877877
if (input_name in model_inputs) {
878878
if (inputs) {
879879
throw new Error(
880-
'`inputs` was passed alongside ' +
881-
`\`${input_name}\` which is not allowed. ` +
882-
`Make sure to either pass \`inputs\` or \`${input_name}\`=...`,
880+
'`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' +
881+
'Make sure to either pass {inputs} or {input_name}=...',
883882
);
884883
}
885884
} else {

packages/transformers/src/models/nemo_conformer_tdt/feature_extraction_nemo_conformer_tdt.js

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,21 @@ import { computeTemporalDeltas } from './transducer_deltas.js';
77

88
const EPSILON = 1e-5;
99
export const NEMO_FEATURE_OUTPUT_OWNERSHIP = Symbol('NemoConformerTDTFeatureOutputOwnership');
10+
export const NEMO_FEATURE_OUTPUT_RELEASE = Symbol('NemoConformerTDTFeatureOutputRelease');
1011

11-
function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors) {
12+
function tagNemoFeatureOutputOwnership(value, cacheOwnsTensors, release = null) {
1213
Object.defineProperty(value, NEMO_FEATURE_OUTPUT_OWNERSHIP, {
1314
value: cacheOwnsTensors,
1415
enumerable: false,
1516
configurable: true,
1617
});
18+
if (release) {
19+
Object.defineProperty(value, NEMO_FEATURE_OUTPUT_RELEASE, {
20+
value: release,
21+
enumerable: false,
22+
configurable: true,
23+
});
24+
}
1725
return value;
1826
}
1927

@@ -152,14 +160,22 @@ export class NemoConformerTDTFeatureExtractor extends FeatureExtractor {
152160

153161
if (this.feature_cache) {
154162
const key = `${createAudioCacheKey(audio, this.config.sampling_rate)}:${this.delta_order}:${this.delta_window}:${this.delta_concatenate}`;
155-
const cached = this.feature_cache.get(key);
163+
const cached = this.feature_cache.acquire(key);
156164
if (cached) {
157-
return tagNemoFeatureOutputOwnership({ ...cached }, true);
165+
return tagNemoFeatureOutputOwnership({ ...cached.value }, true, cached.release);
158166
}
159167

160168
const extracted = await this._extract(audio);
161169
const cacheOwnsTensors = this.feature_cache.set(key, extracted);
162-
return tagNemoFeatureOutputOwnership({ ...extracted }, cacheOwnsTensors);
170+
if (!cacheOwnsTensors) {
171+
return tagNemoFeatureOutputOwnership({ ...extracted }, false);
172+
}
173+
174+
const borrowed = this.feature_cache.acquire(key);
175+
if (!borrowed) {
176+
return tagNemoFeatureOutputOwnership({ ...extracted }, false);
177+
}
178+
return tagNemoFeatureOutputOwnership({ ...borrowed.value }, true, borrowed.release);
163179
}
164180

165181
return tagNemoFeatureOutputOwnership(await this._extract(audio), false);

packages/transformers/src/models/nemo_conformer_tdt/modeling_nemo_conformer_tdt.js

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -945,10 +945,8 @@ export class NemoConformerForTDT extends NemoConformerTDTPreTrainedModel {
945945
}
946946
}
947947

948-
// Register with ModelRegistry so get_model_files / progress_callback enumerate
949-
// the correct ONNX files: encoder_model + decoder_model_merged.
950-
MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT); // model_type key
951-
MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT); // architecture key
948+
MODEL_TYPE_MAPPING.set('nemo-conformer-tdt', MODEL_TYPES.NemoConformerTDT);
949+
MODEL_TYPE_MAPPING.set('NemoConformerForTDT', MODEL_TYPES.NemoConformerTDT);
952950
MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerTDTPreTrainedModel', NemoConformerTDTPreTrainedModel);
953951
MODEL_NAME_TO_CLASS_MAPPING.set('NemoConformerForTDT', NemoConformerForTDT);
954952
MODEL_CLASS_TO_NAME_MAPPING.set(NemoConformerTDTPreTrainedModel, 'NemoConformerTDTPreTrainedModel');

packages/transformers/src/models/nemo_conformer_tdt/pipeline_nemo_conformer_tdt.js

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
import { Tensor } from '../../utils/tensor.js';
2-
import { NEMO_FEATURE_OUTPUT_OWNERSHIP } from './feature_extraction_nemo_conformer_tdt.js';
2+
import { NEMO_FEATURE_OUTPUT_OWNERSHIP, NEMO_FEATURE_OUTPUT_RELEASE } from './feature_extraction_nemo_conformer_tdt.js';
33
import {
44
buildWordChunks,
55
buildNemoSegmentChunks,
66
joinTimedWords,
77
partitionNemoWordsIntoSegments,
88
} from './transducer_segment_offsets.js';
9-
import {
10-
dedupeMergedWords,
11-
} from './transducer_window_merge.js';
9+
import { dedupeMergedWords } from './transducer_window_merge.js';
1210

1311
const NEMO_AUTO_WINDOW_THRESHOLD_S = 180;
1412
const NEMO_MIN_CHUNK_LENGTH_S = 20;
@@ -49,6 +47,13 @@ function disposeNemoPipelineInputs(inputs) {
4947
}
5048
}
5149

50+
function releaseNemoPipelineInputs(inputs) {
51+
const release = inputs?.[NEMO_FEATURE_OUTPUT_RELEASE];
52+
if (typeof release === 'function') {
53+
release();
54+
}
55+
}
56+
5257
function normalizeNemoChunkLengthS(value) {
5358
const num = Number(value);
5459
if (!Number.isFinite(num) || num <= 0) {
@@ -85,7 +90,7 @@ function normalizeNemoSegmentText(text) {
8590
return String(text ?? '')
8691
.normalize('NFKC')
8792
.replace(/[“”]/g, '"')
88-
.replace(/[‘’]/g, '\'')
93+
.replace(/[‘’]/g, "'")
8994
.replace(/\s+/g, ' ')
9095
.trim()
9196
.toLowerCase();
@@ -97,9 +102,10 @@ function isDuplicateFinalizedNemoSegment(finalizedSegments, segment) {
97102
return false;
98103
}
99104

100-
return finalizedSegments.some((candidate) =>
101-
normalizeNemoSegmentText(candidate.text) === normalized &&
102-
Math.abs(candidate.timestamp[1] - segment.timestamp[1]) < NEMO_SEGMENT_DEDUP_TOLERANCE_S,
105+
return finalizedSegments.some(
106+
(candidate) =>
107+
normalizeNemoSegmentText(candidate.text) === normalized &&
108+
Math.abs(candidate.timestamp[1] - segment.timestamp[1]) < NEMO_SEGMENT_DEDUP_TOLERANCE_S,
103109
);
104110
}
105111

@@ -139,13 +145,7 @@ function relocateNemoCursorToNearbyGap(target_s, words) {
139145
return best;
140146
}
141147

142-
async function runNemoAutoSentenceWindowing({
143-
audio,
144-
sampling_rate,
145-
chunk_length_s,
146-
tokenizer,
147-
runNemoTranscribe,
148-
}) {
148+
async function runNemoAutoSentenceWindowing({ audio, sampling_rate, chunk_length_s, tokenizer, runNemoTranscribe }) {
149149
const audio_duration_s = audio.length / sampling_rate;
150150
const fallback_overlap_s = Math.min(NEMO_AUTO_WINDOW_FALLBACK_OVERLAP_S, Math.max(0, chunk_length_s - 1));
151151
const fallback_advance_s = Math.max(1, chunk_length_s - fallback_overlap_s);
@@ -159,7 +159,11 @@ async function runNemoAutoSentenceWindowing({
159159
let start_s = 0;
160160
let shouldMergePending = false;
161161

162-
for (let windowIndex = 0; windowIndex < maxWindows && start_s < audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S; ++windowIndex) {
162+
for (
163+
let windowIndex = 0;
164+
windowIndex < maxWindows && start_s < audio_duration_s - NEMO_AUTO_WINDOW_EPSILON_S;
165+
++windowIndex
166+
) {
163167
const end_s = Math.min(audio_duration_s, start_s + chunk_length_s);
164168
const start_sample = Math.max(0, Math.min(audio.length - 1, Math.floor(start_s * sampling_rate)));
165169
const end_sample = Math.max(start_sample + 1, Math.min(audio.length, Math.ceil(end_s * sampling_rate)));
@@ -199,12 +203,11 @@ async function runNemoAutoSentenceWindowing({
199203
}
200204

201205
pendingWords = dedupeMergedWords(pendingSegment.words);
202-
shouldMergePending = false;
203-
204206
const next_start_s = Math.min(
205207
audio_duration_s,
206208
relocateNemoCursorToNearbyGap(pendingStart_s, windowWords),
207209
);
210+
shouldMergePending = next_start_s > pendingStart_s + NEMO_AUTO_WINDOW_EPSILON_S;
208211
if (next_start_s > start_s + NEMO_AUTO_WINDOW_EPSILON_S) {
209212
start_s = next_start_s;
210213
continue;
@@ -250,14 +253,7 @@ async function runNemoAutoSentenceWindowing({
250253
* prepareAudios: (audio: any[], sampling_rate: number) => Promise<(Float32Array|Float64Array)[]>,
251254
* }} options
252255
*/
253-
export async function runNemoConformerTDTPipeline({
254-
model,
255-
processor,
256-
tokenizer,
257-
audio,
258-
kwargs,
259-
prepareAudios,
260-
}) {
256+
export async function runNemoConformerTDTPipeline({ model, processor, tokenizer, audio, kwargs, prepareAudios }) {
261257
if (typeof model?.transcribe !== 'function') {
262258
throw new Error('Nemo Conformer TDT model does not expose a `transcribe` method.');
263259
}
@@ -292,7 +288,9 @@ export async function runNemoConformerTDTPipeline({
292288
try {
293289
return await model.transcribe(inputs, decodeOptions);
294290
} finally {
295-
if (!cacheOwnsTensors) {
291+
if (cacheOwnsTensors) {
292+
releaseNemoPipelineInputs(inputs);
293+
} else {
296294
disposeNemoPipelineInputs(inputs);
297295
}
298296
}
@@ -303,11 +301,7 @@ export async function runNemoConformerTDTPipeline({
303301
const audio_duration_s = aud.length / sampling_rate;
304302
const autoWindowing = requested_chunk_length_s <= 0 && audio_duration_s > NEMO_AUTO_WINDOW_THRESHOLD_S;
305303
const chunk_length_s =
306-
requested_chunk_length_s > 0
307-
? requested_chunk_length_s
308-
: autoWindowing
309-
? NEMO_AUTO_CHUNK_LENGTH_S
310-
: 0;
304+
requested_chunk_length_s > 0 ? requested_chunk_length_s : autoWindowing ? NEMO_AUTO_CHUNK_LENGTH_S : 0;
311305
const useSentenceWindowing = chunk_length_s > 0;
312306

313307
if (useSentenceWindowing) {

packages/transformers/src/models/nemo_conformer_tdt/transducer_cache.js

Lines changed: 68 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,31 @@ export class FeatureLRUCache {
5050
* @returns {any|null}
5151
*/
5252
get(key) {
53-
const entry = this.cache.get(key);
53+
const entry = this._touch(key);
5454
if (!entry) return null;
55-
this.cache.delete(key);
56-
this.cache.set(key, entry);
5755
return entry.value;
5856
}
5957

58+
/**
59+
* @param {string} key
60+
* @returns {{ value: any, release: () => void } | null}
61+
*/
62+
acquire(key) {
63+
const entry = this._touch(key);
64+
if (!entry) return null;
65+
66+
entry.borrowers += 1;
67+
let released = false;
68+
return {
69+
value: entry.value,
70+
release: () => {
71+
if (released) return;
72+
released = true;
73+
this._releaseEntry(entry);
74+
},
75+
};
76+
}
77+
6078
/**
6179
* @param {string} key
6280
* @param {any} value
@@ -75,12 +93,12 @@ export class FeatureLRUCache {
7593
const existing = this.cache.get(key);
7694
if (existing?.value === value) {
7795
// Refresh recency for unchanged value without invalidating caller-owned references.
78-
this.cache.delete(key);
7996
if (existing.size_bytes <= max_bytes) {
97+
this.cache.delete(key);
8098
this.cache.set(key, existing);
8199
return true;
82100
} else {
83-
this.current_size_bytes -= existing.size_bytes;
101+
this._deleteEntry(key, existing);
84102
return false;
85103
}
86104
}
@@ -89,31 +107,30 @@ export class FeatureLRUCache {
89107
if (size_bytes > max_bytes) {
90108
// Cannot fit in cache: keep caller ownership and skip caching.
91109
if (existing) {
92-
disposeCachedValue(existing.value);
93-
this.current_size_bytes -= existing.size_bytes;
94-
this.cache.delete(key);
110+
this._deleteEntry(key, existing);
95111
}
96112
return false;
97113
}
98114

99115
if (existing) {
100-
disposeCachedValue(existing.value);
101-
this.current_size_bytes -= existing.size_bytes;
102-
this.cache.delete(key);
116+
this._deleteEntry(key, existing);
103117
}
104118

105-
this.cache.set(key, { value, size_bytes });
119+
this.cache.set(key, {
120+
value,
121+
size_bytes,
122+
borrowers: 0,
123+
pendingDispose: false,
124+
});
106125
this.current_size_bytes += size_bytes;
107126
this._evict();
108127
return this.cache.get(key)?.value === value;
109128
}
110129

111130
clear() {
112-
for (const { value } of this.cache.values()) {
113-
disposeCachedValue(value);
131+
for (const [key, entry] of Array.from(this.cache.entries())) {
132+
this._deleteEntry(key, entry);
114133
}
115-
this.cache.clear();
116-
this.current_size_bytes = 0;
117134
}
118135

119136
stats() {
@@ -131,9 +148,41 @@ export class FeatureLRUCache {
131148
const oldest_key = this.cache.keys().next().value;
132149
if (oldest_key === undefined) break;
133150
const oldest = this.cache.get(oldest_key);
134-
this.cache.delete(oldest_key);
135-
disposeCachedValue(oldest?.value);
136-
this.current_size_bytes -= oldest?.size_bytes ?? 0;
151+
if (!oldest) break;
152+
this._deleteEntry(oldest_key, oldest);
153+
}
154+
}
155+
156+
_touch(key) {
157+
const entry = this.cache.get(key);
158+
if (!entry) return null;
159+
this.cache.delete(key);
160+
this.cache.set(key, entry);
161+
return entry;
162+
}
163+
164+
_deleteEntry(key, entry) {
165+
const current = this.cache.get(key);
166+
if (current !== entry) {
167+
return;
168+
}
169+
170+
this.cache.delete(key);
171+
this.current_size_bytes -= entry.size_bytes;
172+
if (entry.borrowers > 0) {
173+
entry.pendingDispose = true;
174+
} else {
175+
disposeCachedValue(entry.value);
176+
}
177+
}
178+
179+
_releaseEntry(entry) {
180+
if (entry.borrowers > 0) {
181+
entry.borrowers -= 1;
182+
}
183+
if (entry.borrowers === 0 && entry.pendingDispose) {
184+
entry.pendingDispose = false;
185+
disposeCachedValue(entry.value);
137186
}
138187
}
139188
}

0 commit comments

Comments
 (0)