Skip to content

Commit 440568a

Browse files
author
sangchengmeng
committed
add-audio
1 parent e9af3d8 commit 440568a

3 files changed

Lines changed: 42 additions & 47 deletions

File tree

lightllm/models/qwen3_omni_moe_thinker/model.py

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -33,53 +33,43 @@ def _get_feat_extract_output_lengths(input_lengths):
3333
return output_lengths
3434

3535

36-
# <|audio_start|><|audio_pad|><|audio_end|>
37-
AUDIO_START_TOKEN = "<|audio_start|>"
38-
AUDIO_END_TOKEN = "<|audio_end|>"
39-
AUDIO_TOKEN_TOKEN = "<|audio_pad|>"
4036
MIN_AUDIO_LEN = 480
4137

4238

4339
class QWen3OmniTokenizer(QWen3VLTokenizer):
44-
def __init__(self, tokenizer=None, image_processor=None, **kwargs):
40+
def __init__(self, tokenizer=None, processor=None, **kwargs):
4541
self.tokenizer = tokenizer
46-
self.image_processor = image_processor
42+
# image
43+
self.image_processor = processor.image_processor
4744
self.min_pixel = self.image_processor.min_pixels
4845
self.max_pixel = self.image_processor.max_pixels
4946
self.patch_size = self.image_processor.patch_size
5047
self.merge_size = self.image_processor.merge_size
48+
49+
# audio
50+
self.audio_processor = processor.feature_extractor
51+
self.sampling_rate = self.audio_processor.sampling_rate
52+
self.n_samples = self.audio_processor.n_samples
53+
self.hop_length = self.audio_processor.hop_length
54+
5155
self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"]
5256
self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
5357
self.image_token_id = kwargs["model_cfg"]["image_token_id"]
5458

55-
self.audio_start_tag = AUDIO_START_TOKEN
56-
self.audio_start_id = tokenizer.convert_tokens_to_ids(self.audio_start_tag)
57-
58-
self.audio_end_tag = AUDIO_END_TOKEN
59-
self.audio_end_id = tokenizer.convert_tokens_to_ids(self.audio_end_tag)
60-
61-
self.audio_token_tag = AUDIO_TOKEN_TOKEN
62-
self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token_tag)
63-
64-
# 这些太hard了, 后面改一下,可以直接从audio_processor里取?
65-
self.sampling_rate = 16000
66-
self.chunk_length = 30
67-
self.n_samples = self.chunk_length * self.sampling_rate
68-
self.hop_length = 160
59+
self.audio_start_id = kwargs["model_cfg"]["audio_start_token_id"]
60+
self.audio_end_id = kwargs["model_cfg"]["audio_end_token_id"]
61+
self.audio_token_id = kwargs["model_cfg"]["audio_token_id"]
6962

7063
def init_audioitem_extral_params(
7164
self, audio: AudioItem, multi_params: MultimodalParams, sampling_params: SamplingParams
7265
):
7366
return
7467

7568
def get_audio_token_length(self, audio: AudioItem):
76-
# audio_bytes = audio._preload_data
77-
# audio_values, _ = librosa.load(BytesIO(audio_bytes), sr=self.sampling_rate)
78-
# length = max(int(audio_values.shape[0]), int(MIN_AUDIO_LEN)) #这个最短还有必要吗?稍等再检查一下
79-
# L_eff = min(length, int(self.n_samples))
80-
# num_frames = L_eff // int(self.hop_length)
81-
82-
return 290
69+
length = min(audio.audio_length, int(self.n_samples))
70+
token_num = length // int(self.hop_length)
71+
print(f"token_num is {token_num}")
72+
return token_num
8373

8474
def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
8575
origin_ids = self.tokenizer.encode(prompt)

lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -340,33 +340,37 @@ def forward(
340340
def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedCacheClient):
341341
uuids = []
342342
items: List[AudioItem] = []
343+
per_audio_features: List[torch.Tensor] = []
343344
for i, item in enumerate(audio_items):
344345
if isinstance(item, AudioItem):
345346
uuids.append(item.uuid)
346347
items.append(item)
347348
audio_data = read_shm(get_shm_name_data(item.uuid))
348349
audio = BytesIO(audio_data)
349-
audio, _ = librosa.load(audio, sr=16000)
350+
audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
350351
else:
351352
raise ValueError(f"cannot read audio which type is {type(item)}!")
352353

353-
input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
354-
print(f"input_features is {input_features}, input_features.shape is {input_features.shape}")
355-
print(f"feature_attention_mask is {feature_attention_mask}, shape is {feature_attention_mask.shape}")
356-
if feature_attention_mask is not None:
357-
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
358-
input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
359-
else:
360-
audio_feature_lengths = None
361-
print(f"input_features is {input_features}, input_features.shape is {input_features.shape}")
362-
363-
feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
364-
print(f"feature_lens is {feature_lens}")
365-
audio_features = self.forward(
366-
input_features,
367-
feature_lens=feature_lens,
368-
)
369-
print(f"audio_features is {audio_features}, shape is {audio_features.shape}")
354+
input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
355+
print(f"input_features is {input_features}, input_features.shape is {input_features.shape}")
356+
print(f"feature_attention_mask is {feature_attention_mask}, shape is {feature_attention_mask.shape}")
357+
if feature_attention_mask is not None:
358+
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
359+
input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
360+
else:
361+
audio_feature_lengths = None
362+
print(f"input_features is {input_features}, input_features.shape is {input_features.shape}")
363+
364+
feature_lens = (
365+
audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
366+
)
367+
print(f"feature_lens is {feature_lens}")
368+
audio_features = self.forward(
369+
input_features,
370+
feature_lens=feature_lens,
371+
)
372+
per_audio_features.append(audio_features)
373+
print(f"audio_features is {audio_features}, shape is {audio_features.shape}")
370374

371375
ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
372376
ids_to_set = []
@@ -377,8 +381,9 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
377381
uid = uuids[i]
378382
item = items[i]
379383

384+
cur_embed = per_audio_features[i]
380385
cpu_embed_cache_client.copy_to_cache(
381-
embed_tensor=audio_features, start_index_in_cache=item.start_index_in_embed_cache
386+
embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache
382387
)
383388
ids_to_set.append(uid)
384389

lightllm/server/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def get_tokenizer(
106106

107107
model_cfg = model_cfg["thinker_config"]
108108
processor = AutoProcessor.from_pretrained(tokenizer_name)
109-
tokenizer = QWen3OmniTokenizer(tokenizer, image_processor=processor.image_processor, model_cfg=model_cfg)
109+
tokenizer = QWen3OmniTokenizer(tokenizer, processor=processor, model_cfg=model_cfg)
110110
elif model_type == "internvl_chat":
111111
tokenizer = InternvlTokenizer(tokenizer, model_cfg, weight_dir=tokenizer_name)
112112
elif model_type == "gemma3":

0 commit comments

Comments (0)