Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion dataflow/cli_funcs/pdf2model_pipeline/pdf_to_qa_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from dataflow.operators.knowledge_cleaning import (
KBCChunkGeneratorBatch,
FileOrURLToMarkdownConverterLocal,
FileOrURLToMarkdownConverterFlash,
KBCTextCleanerBatch,
KBCMultiHopQAGeneratorBatch,
Expand All @@ -29,11 +30,24 @@ def __init__(self, cache_base="./"):
cache_type="json",
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
intermediate_dir=str(cache_path / ".cache"),
mineru_backend="vlm-vllm-engine", # 可选 pipeline, vlm-vllm-engine, vlm-vllm-transformer, vlm-http-client
)

# Faster backend by Flash-MinerU
# See details in the commented-out Flash-MinerU example below.

# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
# intermediate_dir="../example_data/KBCleaningPipeline/flash/",
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
# batch_size=4, # batchsize per vllm worker
# replicas=1, # num of vllm workers
# num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
# engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
# )

self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
split_method="token",
chunk_size=512,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def __init__(self,
intermediate_dir: str = "intermediate",
mineru_backend: str = "vlm-auto-engine",
mineru_source: str = "local",
mienru_model_path:str = None,
mineru_model_path:str = None,
mineru_download_model_type:str = "vlm"
):
"""
Expand All @@ -346,7 +346,7 @@ def __init__(self,
"""
super().__init__(intermediate_dir, mineru_backend)
self.mineru_source = mineru_source
self.mienru_model_path = mienru_model_path
self.mineru_model_path = mineru_model_path
self.mineru_download_model_type = mineru_download_model_type

@staticmethod
Expand All @@ -371,7 +371,7 @@ def get_desc(lang: str = "zh"):
"- intermediate_dir: 中间产物目录(默认 intermediate)\n"
"- mineru_backend: MinerU CLI 后端(默认 vlm-auto-engine;也可 pipeline / vlm-sglang-engine 等)\n"
"- mineru_source: 模型来源(默认 local;对应 MINERU_MODEL_SOURCE)\n"
"- mienru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
"- mineru_model_path: 本地模型目录;提供则会调用 configure_model 配置模型\n"
"- mineru_download_model_type: 配置模型类型(默认 vlm)\n\n"
"运行参数(run):\n"
"- storage: DataFlowStorage,需包含 dataframe\n"
Expand Down Expand Up @@ -448,8 +448,8 @@ def _batch_parse_pdf_with_mineru(self, pdf_files: list):
os.environ.setdefault("MINERU_MODEL_SOURCE", self.mineru_source)

# load local model and config corresponding files https://github.com/opendatalab/MinerU/blob/a12610fb3e9e24488fe3e76cd233ba88ec64bbaf/mineru/cli/models_download.py#L19
if self.mienru_model_path != None:
configure_model(self.mienru_model_path, self.mineru_download_model_type)
if self.mineru_model_path != None:
configure_model(self.mineru_model_path, self.mineru_download_model_type)

parsed_results = {}
for item in pdf_files:
Expand Down
27 changes: 15 additions & 12 deletions dataflow/statics/pipelines/api_pipelines/kbcleaning_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,27 +29,30 @@ def __init__(self):
# ------------case1: use MinerU official API (by default) ------------
# by default we use API provided by MinerU official
# https://mineru.net/apiManage/docs
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
intermediate_dir="../example_data/KBCleaningPipeline/API/",
mineru_backend="vlm", # vlm or pipeline
api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
)
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterAPI(
# intermediate_dir="../example_data/KBCleaningPipeline/API/",
# mineru_backend="vlm", # vlm or pipeline
# api_key=None # !!! place your api key here or set environment variable MINERU_API_KEY!!!
# )
# ------------case2: use Flash-MinerU inference locally with GPU ------------
# https://github.com/OpenDCAI/Flash-MinerU
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
engine_gpu_util_rate_to_ray_cap=0.5
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

# ------------case3: use MinerU official inference locally (much slower than other two) ------------
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
intermediate_dir="../example_data/KBCleaningPipeline/local/",
mineru_backend="vlm-local-engine",
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
)
# self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterLocal(
# intermediate_dir="../example_data/KBCleaningPipeline/local/",
# mineru_backend="vlm-local-engine",
# # https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
# mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B",
# )

self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,15 @@ def __init__(self):

self.vqa_extract_prompt = QAExtractPrompt()

self.mineru_executor = FileOrURLToMarkdownConverterFlash(intermediate_dir = "intermediate", mineru_backend="vlm-vllm-engine")
self.mineru_executor = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)
self.input_formatter = MinerU2LLMInputOperator()
self.vqa_extractor = ChunkedPromptedGenerator(
llm_serving=self.llm_serving,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ def __init__(self):
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
mineru_backend="vlm-vllm-engine",
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
split_method="token",
chunk_size=512,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@ def __init__(self):
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
mineru_backend="vlm-vllm-engine",
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

self.knowledge_cleaning_step2 = KBCChunkGeneratorBatch(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ def __init__(self):
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
mineru_backend="vlm-vllm-engine",
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
chunk_size=512,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@ def __init__(self):
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
mineru_backend="vlm-vllm-engine",
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

self.knowledge_cleaning_step2 = KBCChunkGenerator(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@ def __init__(self):
)

self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
lang="en",
mineru_backend="vlm-vllm-engine",
intermediate_dir="../example_data/KBCleaningPipeline/flash/",
mineru_model_path="<your Model Path>/MinerU2.5-2509-1.2B", # !!! place your local model path here !!!
# https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B.
batch_size=4, # batchsize per vllm worker
replicas=1, # num of vllm workers
num_gpus_per_replica=0.5, # for ray to schedule vllm workers to GPU, can be float, e.g. 0.5 means each worker uses half GPU, 1 means each worker uses whole GPU
engine_gpu_util_rate_to_ray_cap=0.9 # actual GPU utilization for each worker; actual memory per worker = num_gpus_per_replica * engine_gpu_util_rate_to_ray_cap; this is to avoid OOM, you can set it to 0.9 or 0.8 to leave some buffer for other processes on GPU
)

self.knowledge_cleaning_step2 = KBCChunkGenerator(
Expand Down