diff --git a/README.md b/README.md index a0e89a9..abe6e98 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ docker build -t ming:py310-cu121 docker/docker-py310-cu121 ``` At last, start the container with the current repo directory mounted: ```shell -docker run -it --gpus all -v "$(pwd)":/workspace/Ming ming:py310-cu121 ming:py310-cu121 /bin/bash +docker run -it --gpus all -v "$(pwd)":/workspace/Ming ming:py310-cu121 /bin/bash ``` You can run the model with python interface. You may download the huggingface model in the repo directory first (`.../Ming/`) or mount the downloaded model path when starting the container. @@ -219,6 +219,41 @@ pip install gradio_client python gradio_demo.py ``` +## LLaMA-Factory Usage + +We provide LLaMA-Factory tools to facilitate the use of Ming-lite-omni. + +1. First, make sure your environment is ready for Ming. Please clone the original code of this repository and download Ming-Lite-Omni-1.5 from [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ming-Lite-Omni-1.5) or [🤖 ModelScope](https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni-1.5) following the official installation guidance above. + ```shell + Ming + ├── am.mvn + ├── audio_detokenizer + ├── ... + ├── inclusionAI + │ ├── Ming-Lite-Omni-1.5 + │ │ ├── config.json + │ │ ├── ... + ``` + +2. Download the original code of `LLaMA-Factory==0.9.3`. Apply the patch for compatibility with Ming and install LLaMA-Factory with the official installation guide. + ```shell + git clone https://github.com/hiyouga/LLaMA-Factory.git + cd LLaMA-Factory + git checkout v0.9.3 + + cp <path_to_Ming_repo>/ming.patch . + git apply ming.patch + pip install . + ``` + +3. Now you can fine-tune Ming with LLaMA-Factory. You can change `ming_lora_sft.yaml` to `<your_own_config.yaml>`. + ```shell + cd <path_to_config_dir> + llamafactory-cli train ming_lora_sft.yaml + ``` + +4. You can change the `model_name_or_path` and `output_dir` to your own model path or output path. You can also uncomment the `eval` part to enable evaluation. 
+For more details for llama-factory usage, please refer to the original [Github Repository](https://github.com/hiyouga/LLaMA-Factory). ## License and Legal Disclaimer diff --git a/ming.patch b/ming.patch new file mode 100644 index 0000000..4e3c16f --- /dev/null +++ b/ming.patch @@ -0,0 +1,273 @@ +diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py +index adaaaa87..bd16f78f 100644 +--- a/src/llamafactory/chat/hf_engine.py ++++ b/src/llamafactory/chat/hf_engine.py +@@ -204,6 +204,9 @@ class HuggingfaceEngine(BaseEngine): + gen_kwargs["audio_feature_lens"] = mm_inputs["audio_feature_lens"] + + gen_kwargs.pop("image_sizes", None) ++ if getattr(model.config, "model_type", None) in ["bailingmm"]: ++ gen_kwargs["input_ids"] = inputs ++ del gen_kwargs["inputs"] + + return gen_kwargs, prompt_length + +diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py +index 3ac9c307..12c66079 100644 +--- a/src/llamafactory/data/mm_plugin.py ++++ b/src/llamafactory/data/mm_plugin.py +@@ -892,6 +892,187 @@ class LlavaNextVideoPlugin(BasePlugin): + return messages + + ++@dataclass ++class MingOmniPlugin(BasePlugin): ++ def _validate_input( ++ self, ++ processor: Optional["MMProcessor"], ++ images: list["ImageInput"], ++ videos: list["VideoInput"], ++ audios: list["AudioInput"], ++ ) -> None: ++ r"""Validate if this model accepts the input modalities.""" ++ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None) ++ video_processor: BaseImageProcessor = getattr(processor, "image_processor", None) ++ audio_processor = getattr(processor, "audio_processor", None) ++ if len(images) != 0 and self.image_token is None: ++ raise ValueError( ++ "This model does not support image input. Please check whether the correct `template` is used." ++ ) ++ ++ if len(videos) != 0 and self.video_token is None: ++ raise ValueError( ++ "This model does not support video input. 
Please check whether the correct `template` is used." ++ ) ++ ++ if len(audios) != 0 and self.audio_token is None: ++ raise ValueError( ++ "This model does not support audio input. Please check whether the correct `template` is used." ++ ) ++ ++ if self.image_token is not None and processor is None: ++ raise ValueError("Processor was not found, please check and update your model file.") ++ ++ if self.image_token is not None and image_processor is None: ++ raise ValueError("Image processor was not found, please check and update your model file.") ++ ++ if self.video_token is not None and video_processor is None: ++ raise ValueError("Video processor was not found, please check and update your model file.") ++ ++ if self.audio_token is not None and audio_processor is None: ++ raise ValueError("Audio feature extractor was not found, please check and update your model file.") ++ ++ @override ++ def _get_mm_inputs( ++ self, ++ images: list["ImageInput"], ++ videos: list["VideoInput"], ++ audios: list["AudioInput"], ++ processor: "MMProcessor", ++ ) -> dict[str, "torch.Tensor"]: ++ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None) ++ audio_processor = getattr(processor, "audio_processor", None) ++ mm_inputs = {} ++ if len(images) != 0: ++ images = self._regularize_images( ++ images, ++ image_max_pixels=getattr(image_processor, "max_pixels", 2007040), ++ image_min_pixels=getattr(image_processor, "min_pixels", 78400), ++ )["images"] ++ mm_inputs.update(image_processor(images=images, videos=None, return_tensors="pt")) ++ ++ if len(videos) != 0: ++ videos = self._regularize_videos( ++ videos, ++ image_max_pixels=getattr(image_processor, "max_pixels_video", 768 * 28 * 28), ++ image_min_pixels=getattr(image_processor, "min_pixels_video", 128 * 28 * 28), ++ video_fps=getattr(image_processor, "video_fps", 2.0), ++ video_maxlen=getattr(image_processor, "video_maxlen", 128), ++ )["videos"] ++ # Ming can only deal with even frames. 
++ videos = [video[:-1] if len(video) % 2 else video for video in videos] ++ mm_inputs.update(image_processor(images=None, videos=videos, do_resize=True, return_tensors="pt")) ++ ++ # if len(audios) != 0: ++ # sampling_rate = 16000 ++ # audios = self._regularize_audios(audios, sampling_rate=sampling_rate)["audios"] ++ # audios = [(torch.tensor(audio), sampling_rate)for audio in audios] ++ # mm_inputs.update( ++ # audio_processor( ++ # audios, ++ # padding="max_length", ++ # use_whisper_encoder=False, ++ # return_tensors="pt", ++ # ) ++ # ) ++ ++ return mm_inputs ++ ++ @override ++ def process_messages( ++ self, ++ messages: list[dict[str, str]], ++ images: list["ImageInput"], ++ videos: list["VideoInput"], ++ audios: list["AudioInput"], ++ processor: Optional["MMProcessor"], ++ ) -> list[dict[str, str]]: ++ self._validate_input(processor, images, videos, audios) ++ self._validate_messages(messages, images, videos, audios) ++ messages = deepcopy(messages) ++ image_processor: BaseImageProcessor = getattr(processor, "image_processor") ++ image_inputs, video_inputs, audio_inputs = {}, {}, {} ++ ++ if len(images): ++ image_inputs = self._get_mm_inputs(images, [], [], processor) ++ image_grid_thw = image_inputs["image_grid_thw"] ++ ++ if len(videos): ++ # assert len(videos) <= 1, "Video count must be at most 1!" 
++ video_inputs = self._get_mm_inputs([], videos, [], processor) ++ video_grid_thw = video_inputs["video_grid_thw"] ++ ++ # if len(audios): ++ # audio_inputs = self._get_mm_inputs([], [], audios, processor) ++ # audio_feats_lengths = audio_inputs["encoder_feats_lengths"] ++ ++ if self.expand_mm_tokens and image_inputs: ++ idx = 0 ++ num_query_token = torch.prod(image_grid_thw, dim=1) // 4 ++ for index, message in enumerate(messages): ++ content = message["content"] ++ ++ image_tags = re.findall(IMAGE_PLACEHOLDER, content) ++ text_chunks = content.split(IMAGE_PLACEHOLDER) ++ final_text = "" ++ for i in range(len(image_tags)): ++ final_text = ( ++ final_text ++ + text_chunks[i] ++ + "" ++ + "" * num_query_token[idx] ++ + "" + "\n" ++ ) ++ idx += 1 ++ ++ final_text += text_chunks[-1] ++ messages[index]["content"] = final_text ++ ++ if self.expand_mm_tokens and video_inputs: ++ idx = 0 ++ num_query_token = torch.prod(video_grid_thw, dim=1) // 4 ++ for index, message in enumerate(messages): ++ content = message["content"] ++ ++ video_tags = re.findall(VIDEO_PLACEHOLDER, content) ++ text_chunks = content.split(VIDEO_PLACEHOLDER) ++ final_text = "" ++ for i in range(len(video_tags)): ++ final_text = ( ++ final_text ++ + text_chunks[i] ++ + "" + "\n" ++ ) ++ idx += 1 ++ ++ final_text += text_chunks[-1] ++ messages[index]["content"] = final_text ++ ++ if self.expand_mm_tokens and audio_inputs: ++ idx = 0 ++ for index, message in enumerate(messages): ++ content = message["content"] ++ ++ audio_tags = re.findall(AUDIO_PLACEHOLDER, content) ++ text_chunks = content.split(AUDIO_PLACEHOLDER) ++ final_text = "" ++ for i in range(len(audio_tags)): ++ final_text = ( ++ final_text ++ + text_chunks[i] ++ + "") ++ idx += 1 ++ ++ final_text += text_chunks[-1] ++ messages[index]["content"] = final_text ++ ++ return messages ++ ++ + @dataclass + class MiniCPMVPlugin(BasePlugin): + @override +@@ -1724,6 +1905,7 @@ PLUGINS = { + "llava": LlavaPlugin, + "llava_next": LlavaNextPlugin, + 
"llava_next_video": LlavaNextVideoPlugin, ++ "ming_omni": MingOmniPlugin, + "minicpm_v": MiniCPMVPlugin, + "mllama": MllamaPlugin, + "paligemma": PaliGemmaPlugin, +diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py +index ac19b30b..adc4bf71 100644 +--- a/src/llamafactory/data/template.py ++++ b/src/llamafactory/data/template.py +@@ -1390,6 +1390,18 @@ register_template( + ) + + ++# copied from bailing template ++register_template( ++ name="ming", ++ format_user=StringFormatter(slots=["HUMAN{{content}}ASSISTANT"]), ++ format_system=StringFormatter(slots=["SYSTEM{{content}}"]), ++ format_observation=StringFormatter(slots=["OBSERVATION{{content}}ASSISTANT"]), ++ stop_words=["<|endoftext|>"], ++ efficient_eos=True, ++ mm_plugin=get_mm_plugin(name="ming_omni", audio_token="