Skip to content

Commit 690d3da

Browse files
committed
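[pdf2vqa] minor fix: open JSON/JSONL files with explicit UTF-8 encoding, derive the image source directory with os.path.dirname, include the missing path in the "Images directory not found" error, and trim the converter import to FileOrURLToMarkdownConverterAPI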
1 parent 7ff3dfb commit 690d3da

File tree

3 files changed

+8
-8
lines changed

3 files changed

+8
-8
lines changed

dataflow/operators/pdf2vqa/generate/llm_output_parser.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ def _id_to_text(self, input_ids, input_json, image_prefix="images"):
6767

6868
def _convert_response(self, input_response, input_json_path, image_prefix="images"):
6969
qa_list = []
70-
with open(input_json_path, 'r') as infile:
70+
with open(input_json_path, 'r', encoding='utf-8') as infile:
7171
input_json = list(json.load(infile))
7272
# 提取title
7373
for chapter_block in re.findall(r'<chapter>(.*?)</chapter>', input_response, flags=re.DOTALL):
@@ -116,18 +116,18 @@ def run(self, storage: DataFlowStorage,
116116
qa_list = self._convert_response(response, converted_json_path, image_prefix)
117117
output_qalist_path = os.path.join(self.output_dir, name, f"extracted_vqa.jsonl")
118118
os.makedirs(os.path.dirname(output_qalist_path), exist_ok=True)
119-
with open(output_qalist_path, 'w') as outfile:
119+
with open(output_qalist_path, 'w', encoding='utf-8') as outfile:
120120
for qa in qa_list:
121121
json.dump(qa, outfile, ensure_ascii=False)
122122
outfile.write('\n')
123123

124124
# 复制图片
125-
src_dir = converted_json_path.rpartition('/')[0]
125+
src_dir = os.path.dirname(converted_json_path)
126126
src_images = os.path.join(src_dir, 'vlm', 'images')
127127
if not os.path.exists(src_images):
128128
src_images = os.path.join(src_dir, 'images')
129129
if not os.path.exists(src_images):
130-
raise ValueError("Images directory not found! There might be a change in Mineru API!")
130+
raise ValueError(f"Images directory {src_images} not found! There might be a change in Mineru API!")
131131
dst_images = os.path.join(self.output_dir, image_prefix)
132132

133133
try:

dataflow/operators/pdf2vqa/generate/mineru_to_llm_input_operator.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ def get_desc(lang: str = "zh") -> str:
2626
)
2727

2828
def _convert_json(self, input_file, output_file):
29-
with open(input_file, 'r') as infile:
29+
with open(input_file, 'r', encoding="utf-8") as infile:
3030
data = list(json.load(infile))
3131

3232
new_data = []
@@ -49,7 +49,7 @@ def _convert_json(self, input_file, output_file):
4949
new_data.append(item)
5050
id += 1
5151

52-
with open(output_file, 'w') as outfile:
52+
with open(output_file, 'w', encoding='utf-8') as outfile:
5353
json.dump(new_data, outfile, ensure_ascii=False)
5454

5555
def run(self, storage: DataFlowStorage,
@@ -69,7 +69,7 @@ def run(self, storage: DataFlowStorage,
6969
self._convert_json(input_json_path, converted_path)
7070
dataframe.at[index, output_converted_layout_key] = converted_path
7171

72-
with open(converted_path, 'r') as infile:
72+
with open(converted_path, 'r', encoding='utf-8') as infile:
7373
data = json.load(infile)
7474
assert isinstance(data, list), f"Expected list, got {type(data)} for {input_json_path}"
7575

dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from dataflow.operators.knowledge_cleaning import FileOrURLToMarkdownConverterFlash, FileOrURLToMarkdownConverterLocal, FileOrURLToMarkdownConverterAPI
1+
from dataflow.operators.knowledge_cleaning import FileOrURLToMarkdownConverterAPI
22

33
from dataflow.serving import APILLMServing_request
44
from dataflow.utils.storage import FileStorage

0 commit comments

Comments
 (0)