Description
Thanks for this great dLLM work. I'm now testing inference using the weights from https://huggingface.co/diffusionfamily/diffullama/tree/main, but I run into a problem.
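For context, I load the checkpoint roughly like this (simplified and from memory; the real loading is in inf_diffullama.py and the diffusion wrapper in model.py, so treat the snippet below as an illustration of my setup rather than the exact code):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Simplified sketch of my setup (not the exact code from inf_diffullama.py):
# the base LLaMA backbone is loaded with FlashAttention-2, which is the
# code path that ends up crashing below.
tokenizer = AutoTokenizer.from_pretrained("diffusionfamily/diffullama")
denoise_model = AutoModel.from_pretrained(
    "diffusionfamily/diffullama",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```

Running generate_samples then fails with this traceback: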
Traceback (most recent call last):
File "/data1/aether.wu/DiffuLLaMA/inf_diffullama.py", line 61, in
res = generate_samples(model, args, tokenizer, inputs, verbose=args.verbose)
File "/data1/aether.wu/DiffuLLaMA/model.py", line 110, in generate_samples
logits = model(xt, attention_mask=attention_mask)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/data1/aether.wu/DiffuLLaMA/model.py", line 74, in forward
x = self.denoise_model(inputs_embeds = x_embed, attention_mask=attention_mask, return_dict = False)[0]
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/data1/aether.wu/DiffuLLaMA/attention_patch.py", line 421, in forward_llama2
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 734, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 556, in forward
attn_output = _flash_attention_forward(
File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 252, in _flash_attention_forward
attn_output_unpad = flash_attn_varlen_func(
File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
return FlashAttnVarlenFunc.apply(
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 925, in forward
out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
File "/usr/local/lib/python3.10/dist-packages/torch/_ops.py", line 1123, in call
return self._op(*args, **(kwargs or {}))
File "/usr/local/lib/python3.10/dist-packages/torch/_library/autograd.py", line 113, in autograd_impl
result = forward_no_grad(*args, Metadata(keyset, keyword_only_args))
File "/usr/local/lib/python3.10/dist-packages/torch/_library/autograd.py", line 40, in forward_no_grad
result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_ops.py", line 728, in redispatch
return self._handle.redispatch_boxed(keyset, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_library/custom_ops.py", line 305, in backend_impl
result = self._backend_fns[device_type](*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_compile.py", line 32, in inner
return disable_fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_library/custom_ops.py", line 337, in wrapped_fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 165, in _flash_attn_varlen_forward
out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
RuntimeError: cu_seqlens_q must have shape (batch_size + 1)
It seems like some shape/alignment issue with the sequence-length vector passed to the FlashAttention varlen kernel. By the way, I'm using inf_diffullama.py and model_llama.py from the 'DiffuLLaMA-training' folder, and my versions are transformers==4.44.2 and flash-attn==2.6.3. Thank you!
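If it helps, my understanding of where cu_seqlens_q comes from: transformers' _flash_attention_forward unpads the batch and derives the cumulative sequence lengths from the 2-D attention_mask, roughly like this (my own simplified illustration, not the library's exact code):

```python
import torch
import torch.nn.functional as F

# Toy illustration of the contract flash_attn_varlen_func expects:
# for a padded batch of 2 sequences (lengths 5 and 3), cu_seqlens is the
# cumulative lengths with a leading zero, i.e. shape (batch_size + 1,).
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])

seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([5, 3])
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
print(cu_seqlens)        # tensor([0, 5, 8], dtype=torch.int32)
print(cu_seqlens.shape)  # torch.Size([3]) == batch_size + 1 for batch_size = 2
```

So my guess is that the attention_mask produced in model.py isn't in the plain 2-D padding-mask shape this unpad path expects, which would leave cu_seqlens_q with the wrong length.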