Skip to content

Commit 80f3f2a

Browse files
committed
fix: SenseVoice+spk_model crash and torchaudio>=2.11 compatibility
1. auto_model.py: When the ASR model (e.g. SenseVoice) does not produce timestamps, automatically fall back to vad_segment mode for speaker diarization instead of crashing. Fixes #2945.
2. load_utils.py: Add soundfile as a fallback when torchaudio.load fails (torchaudio>=2.11 requires torchcodec). The loader falls through to ffmpeg only if soundfile also fails.
1 parent 0d824c1 commit 80f3f2a

2 files changed

Lines changed: 14 additions & 4 deletions

File tree

funasr/auto/auto_model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,9 @@ def inference_with_vad(self, input, input_len=None, **cfg):
819819
)
820820
# del result['spk_embedding']
821821
sv_output = postprocess(all_segments, None, labels, spk_embedding.cpu())
822+
if self.spk_mode == "punc_segment" and "timestamp" not in result and "timestamps" not in result:
823+
logging.warning("No timestamps in ASR result (e.g. SenseVoice), falling back to vad_segment mode for speaker diarization.")
824+
self.spk_mode = "vad_segment"
822825
if self.spk_mode == "vad_segment": # recover sentence_list
823826
sentence_list = []
824827
for rest, vadsegment in zip(restored_data, vadsegments):

funasr/utils/load_utils.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,17 @@ def load_audio_text_image_video(
113113
if kwargs.get("reduce_channels", True):
114114
data_or_path_or_list = data_or_path_or_list.mean(0)
115115
except:
116-
data_or_path_or_list = _load_audio_ffmpeg(data_or_path_or_list, sr=fs)
117-
data_or_path_or_list = torch.from_numpy(
118-
data_or_path_or_list
119-
).squeeze() # [n_samples,]
116+
try:
117+
import soundfile as sf
118+
data_np, audio_fs = sf.read(data_or_path_or_list, dtype="float32")
119+
data_or_path_or_list = torch.from_numpy(data_np).squeeze()
120+
if data_or_path_or_list.ndim > 1 and kwargs.get("reduce_channels", True):
121+
data_or_path_or_list = data_or_path_or_list.mean(-1)
122+
except:
123+
data_or_path_or_list = _load_audio_ffmpeg(data_or_path_or_list, sr=fs)
124+
data_or_path_or_list = torch.from_numpy(
125+
data_or_path_or_list
126+
).squeeze() # [n_samples,]
120127
elif data_type == "text" and tokenizer is not None:
121128
with open(data_or_path_or_list, "r") as f:
122129
data_or_path_or_list = tokenizer.encode(f.read().strip())

0 commit comments

Comments
 (0)