Skip to content

Commit 0946ffc

Browse files
Lavender1 and SunnyHaze authored
[MinerU Ops] integrate 3 mineru operators for clarity (#473)
* integrate 3 mineru operators
* lazy load flash_mineru
* [op] update mineru OPs: update get_desc; update param for __init__; fit flash-mineru v0.0.2
* [statics] revise name of mineru OPs; update all script to fit mineru update.

---------

Co-authored-by: Sunnyhaze <mxch1122@126.com>
1 parent 62d3760 commit 0946ffc

13 files changed

Lines changed: 677 additions & 607 deletions

dataflow/cli_funcs/pdf2model_pipeline/pdf_to_qa_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from pathlib import Path
66
from dataflow.operators.knowledge_cleaning import (
77
KBCChunkGeneratorBatch,
8-
FileOrURLToMarkdownConverterBatch,
8+
FileOrURLToMarkdownConverterFlash,
99
KBCTextCleanerBatch,
1010
KBCMultiHopQAGeneratorBatch,
1111
QAExtractor
@@ -29,7 +29,7 @@ def __init__(self, cache_base="./"):
2929
cache_type="json",
3030
)
3131

32-
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterBatch(
32+
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterFlash(
3333
intermediate_dir=str(cache_path / ".cache"),
3434
mineru_backend="vlm-vllm-engine", # 可选 pipeline, vlm-vllm-engine, vlm-vllm-transformer, vlm-http-client
3535
)

dataflow/operators/knowledge_cleaning/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
from .generate.kbc_chunk_generator import KBCChunkGenerator
66
from .generate.kbc_chunk_generator_batch import KBCChunkGeneratorBatch
77
# from .generate.file_or_url_to_markdown_converter import FileOrURLToMarkdownConverter
8-
from .generate.file_or_url_to_markdown_converter_batch import FileOrURLToMarkdownConverterBatch
9-
from .generate.file_or_url_to_markdown_converter_api import FileOrURLToMarkdownConverterAPI
8+
from .generate.mineru_operators import FileOrURLToMarkdownConverterLocal
9+
from .generate.mineru_operators import FileOrURLToMarkdownConverterAPI
10+
from .generate.mineru_operators import FileOrURLToMarkdownConverterFlash
1011
from .generate.kbc_text_cleaner import KBCTextCleaner
1112
from .generate.kbc_text_cleaner_batch import KBCTextCleanerBatch
1213
from .generate.mathbook_question_extract import MathBookQuestionExtract

dataflow/operators/knowledge_cleaning/generate/file_or_url_to_markdown_converter_api.py

Lines changed: 0 additions & 298 deletions
This file was deleted.

0 commit comments

Comments (0)