Skip to content

Commit 3696ef0

Browse files
committed
[pdf2vqa] 适配api返回格式
1 parent 9e20c01 commit 3696ef0

3 files changed

Lines changed: 18 additions & 7 deletions

File tree

dataflow/operators/pdf2vqa/generate/llm_output_parser.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,12 @@ def run(self, storage: DataFlowStorage,
122122
outfile.write('\n')
123123

124124
# 复制图片
125-
src_dir = os.path.join(self.intermediate_dir, 'mineru', Path(converted_json_path).stem).replace('_content_list_converted','')
125+
src_dir = converted_json_path.rpartition('/')[0]
126126
src_images = os.path.join(src_dir, 'vlm', 'images')
127+
if not os.path.exists(src_images):
128+
src_images = os.path.join(src_dir, 'images')
129+
if not os.path.exists(src_images):
130+
raise ValueError("Images directory not found! There might be a change in Mineru API!")
127131
dst_images = os.path.join(self.output_dir, image_prefix)
128132

129133
try:

dataflow/operators/pdf2vqa/generate/mineru_to_llm_input_operator.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
from dataflow.utils.registry import OPERATOR_REGISTRY
44
from dataflow.utils.storage import DataFlowStorage
55

6+
from pathlib import Path
7+
68
@OPERATOR_REGISTRY.register()
79
class MinerU2LLMInputOperator(OperatorABC):
810
def __init__(self):
@@ -57,8 +59,13 @@ def run(self, storage: DataFlowStorage,
5759
dataframe = storage.read("dataframe")
5860

5961
for index, row in dataframe.iterrows():
60-
input_json_path = row[input_markdown_path_key].replace('.md', '_content_list.json')
61-
converted_path = input_json_path.replace('.json', '_converted.json')
62+
md_path = Path(row[input_markdown_path_key])
63+
try:
64+
input_json_path = list(md_path.parent.glob("*_content_list.json"))[0]
65+
except:
66+
raise ValueError("No _content_list.json file found in the api result. There might be an error with the Mineru api.")
67+
68+
converted_path = str(input_json_path).replace('.json', '_converted.json')
6269
self._convert_json(input_json_path, converted_path)
6370
dataframe.at[index, output_converted_layout_key] = converted_path
6471

dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from dataflow.operators.knowledge_cleaning import FileOrURLToMarkdownConverterFlash
1+
from dataflow.operators.knowledge_cleaning import FileOrURLToMarkdownConverterFlash, FileOrURLToMarkdownConverterLocal, FileOrURLToMarkdownConverterAPI
22

33
from dataflow.serving import APILLMServing_request
44
from dataflow.utils.storage import FileStorage
@@ -30,7 +30,7 @@ def __init__(self):
3030
self.vqa_extract_prompt = QAExtractPrompt()
3131

3232
self.pdf_merger = PDF_Merger(output_dir="./cache")
33-
self.mineru_executor = FileOrURLToMarkdownConverterBatch(intermediate_dir = "intermediate", mineru_backend="vlm-vllm-engine")
33+
self.mineru_executor = FileOrURLToMarkdownConverterAPI(intermediate_dir = "intermediate")
3434
self.input_formatter = MinerU2LLMInputOperator()
3535
self.vqa_extractor = ChunkedPromptedGenerator(
3636
llm_serving=self.llm_serving,
@@ -42,9 +42,9 @@ def __init__(self):
4242
def forward(self):
4343
self.pdf_merger.run(
4444
storage=self.storage.step(),
45-
input_pdf_path_key="input_pdf_paths",
45+
input_pdf_list_key="input_pdf_paths",
4646
input_name_key="name",
47-
output_merged_pdf_path_key="merged_pdf_path",
47+
output_pdf_path_key="merged_pdf_path",
4848
)
4949
self.mineru_executor.run(
5050
storage=self.storage.step(),

0 commit comments

Comments
 (0)