Skip to content

Commit 77fb8bb

Browse files
authored
Merge branch 'OpenDCAI:main' into main
2 parents 3308318 + b24ddad commit 77fb8bb

15 files changed

Lines changed: 431 additions & 94 deletions
60.6 KB
Binary file not shown.
73.5 KB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"pdf_path": "../example_data/VQA/questionextract_test.pdf", "subject": "math", "output_dir": "../vqa_output_math"}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"question_pdf_path": "../example_data/VQA/math_question.pdf", "answer_pdf_path": "../example_data/VQA/math_answer.pdf", "subject": "math", "output_dir": "../vqa_output_math"}

dataflow/example/VQA/vqa_extract_test.jsonl

Lines changed: 0 additions & 1 deletion
This file was deleted.

dataflow/operators/vqa/generate/vqa_extract_doclayout.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,17 @@ def modified_draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_confi
4343
logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")
4444
rotation = 0
4545

46-
if rotation == 0:
46+
if rotation == 90:
4747
c.translate(rect[0] + rect[2] + 2, rect[1] + rect[3] - 10)
48-
elif rotation == 90:
48+
elif rotation == 0:
4949
c.translate(rect[0] + 10, rect[1] + rect[3] + 2)
5050
elif rotation == 180:
5151
c.translate(rect[0] - 2, rect[1] + 10)
5252
elif rotation == 270:
5353
c.translate(rect[0] + rect[2] - 10, rect[1] - 2)
5454

5555
c.rotate(rotation)
56-
c.drawString(0, 0, f"tag{i}:")
57-
c.drawString(0, -10, f"box{j}")
56+
c.drawString(0, 0, f"tag{i}:box{j}")
5857
c.restoreState()
5958

6059
return c
@@ -147,8 +146,8 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
147146
imgs_body_list.append(imgs_body)
148147
# imgs_caption_list.append(imgs_caption)
149148
# imgs_footnote_list.append(imgs_footnote)
150-
titles_list.append(titles)
151-
texts_list.append(texts)
149+
# titles_list.append(titles)
150+
# texts_list.append(texts)
152151
interequations_list.append(interequations)
153152
lists_list.append(lists)
154153
list_items_list.append(list_items)
@@ -162,9 +161,9 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
162161
page_block_list = []
163162
for block in page["para_blocks"]:
164163
if block["type"] in [
165-
BlockType.TEXT,
166-
BlockType.REF_TEXT,
167-
BlockType.TITLE,
164+
# BlockType.TEXT,
165+
# BlockType.REF_TEXT,
166+
# BlockType.TITLE,
168167
BlockType.INTERLINE_EQUATION,
169168
BlockType.LIST,
170169
BlockType.INDEX,
@@ -199,8 +198,8 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
199198
c = modified_draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
200199
# c = modified_draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
201200
# c = modified_draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
202-
c = modified_draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
203-
c = modified_draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
201+
# c = modified_draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
202+
# c = modified_draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
204203
c = modified_draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
205204
c = modified_draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
206205
c = modified_draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
@@ -236,6 +235,7 @@ def __init__(self, mineru_backend: Literal["vlm-transformers","vlm-vllm-engine"]
236235

237236
def run(self, storage, input_pdf_file_path:str,
238237
output_folder:str):
238+
global cal_canvas_rect, BlockType, SplitFlag, PdfReader, PdfWriter, PageObject, canvas
239239
try:
240240
import mineru
241241
from mineru.utils.draw_bbox import cal_canvas_rect

dataflow/operators/vqa/generate/vqa_extract_pic_extractor.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,12 @@
1515
class VQAExtractPicExtractor(OperatorABC):
1616
def __init__(self,
1717
llm_serving: LLMServingABC = None,
18-
model: str = "o4-mini",
19-
subject: str = "math"
18+
interleaved: bool = True
2019
):
2120
self.logger = get_logger()
2221
self.llm_serving = llm_serving
2322
self.prompt = VQAExtractPrompt()
24-
self.model = model
25-
self.subject = subject
23+
self.interleaved = interleaved
2624

2725
def _format_instructions(self, image_files: List[str]):
2826
list_of_image_paths = []
@@ -34,7 +32,7 @@ def _format_instructions(self, image_files: List[str]):
3432
return list_of_image_paths, list_of_image_labels
3533

3634

37-
def run(self, storage, input_layout_path: str, output_folder: str):
35+
def run(self, storage, input_layout_path: str, input_subject: str, output_folder: str):
3836
# 从layout_path/images中读取所有图片的文件名,确保为绝对路径
3937
image_files = [os.path.join(input_layout_path, image_file) for image_file in os.listdir(input_layout_path)]
4038
# 确保end with jpg & png
@@ -46,9 +44,9 @@ def filename2idx(filename: str):
4644
image_files.sort(key=filename2idx)
4745

4846
list_of_image_paths, list_of_image_labels = self._format_instructions(image_files)
49-
system_prompt = self.prompt.build_prompt(self.subject)
47+
system_prompt = self.prompt.build_prompt(input_subject, interleaved=self.interleaved)
5048

51-
responses = self.llm_serving.generate_from_input_multi_images(list_of_image_paths, list_of_image_labels, system_prompt, self.model)
49+
responses = self.llm_serving.generate_from_input_multi_images(list_of_image_paths, list_of_image_labels, system_prompt)
5250

5351
# 将list of image paths和list of image labels和responses作为三列组织为jsonl
5452
list_of_dict = []

dataflow/operators/vqa/generate/vqa_extract_qapair_extractor.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,45 @@ def __init__(self):
1212
def extract_qa_pairs_from_text(self, page: int, text: str):
1313
"""
1414
从一段 response 文本中提取所有 <qa_pair>…</qa_pair>
15+
格式为:<chapter><title>MAIN_TITLE</title>
16+
<qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
17+
<answer>ANSWER_TEXT<pic>…</pic>…</answer></qa_pair>
18+
<qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
19+
<answer>ANSWER_TEXT<pic>…</pic>…</answer></qa_pair>
20+
</chapter>
1521
并返回 [{'page': ..., 'question': ..., 'answer': ..., 'solution': ..., 'label': ..., 'chapter_title': ...}, …]
1622
"""
1723
qa_list = []
18-
# 找出所有 qa_pair 块
19-
for pair in re.findall(r'<qa_pair>(.*?)</qa_pair>', text, flags=re.DOTALL):
20-
# 提取 question 部分
21-
q_match = re.search(r'<question>(.*?)</question>', pair, flags=re.DOTALL)
22-
# 提取 answer 部分
23-
a_match = re.search(r'<answer>(.*?)</answer>', pair, flags=re.DOTALL)
24-
if not (q_match and a_match):
25-
continue
26-
question = q_match.group(1).strip()
27-
answer = a_match.group(1).strip()
28-
qa_list.append({
29-
'page': page,
30-
'question': question,
31-
'answer': answer
32-
})
24+
# 提取title
25+
for chapter_block in re.findall(r'<chapter>(.*?)</chapter>', text, flags=re.DOTALL):
26+
title = re.search(r'<title>(.*?)</title>', chapter_block, flags=re.DOTALL)
27+
if title:
28+
chapter_title = title.group(1).strip()
29+
else:
30+
chapter_title = ""
31+
# 找出所有 qa_pair 块
32+
for pair in re.findall(r'<qa_pair>(.*?)</qa_pair>', chapter_block, flags=re.DOTALL):
33+
# 提取 question 部分
34+
q_match = re.search(r'<question>(.*?)</question>', pair, flags=re.DOTALL)
35+
# 提取 answer 部分
36+
a_match = re.search(r'<answer>(.*?)</answer>', pair, flags=re.DOTALL)
37+
# 提取solution部分
38+
s_match = re.search(r'<solution>(.*?)</solution>', pair, flags=re.DOTALL)
39+
# 提取label
40+
label_match = re.search(r'<label>(.*?)</label>', pair, flags=re.DOTALL)
41+
if not (q_match and a_match and label_match):
42+
continue
43+
question = q_match.group(1).strip()
44+
answer = a_match.group(1).strip()
45+
label = label_match.group(1).strip()
46+
qa_list.append({
47+
'page': page,
48+
'question': question,
49+
'answer': answer,
50+
'solution': s_match.group(1).strip() if s_match else "",
51+
'label': label,
52+
'chapter_title': chapter_title
53+
})
3354
return qa_list
3455

3556
def run(self, storage, input_vqa_extract_path: str, output_qa_path: str):

dataflow/operators/vqa/generate/vqa_extract_tag2img.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,29 +11,17 @@
1111

1212
@OPERATOR_REGISTRY.register()
1313
class VQAExtractTag2Img(OperatorABC):
14-
def __init__(self, layout_json, pdf_image_dir, output_image_dir, layout_prefix='doclay_page_', image_prefix='page_'):
14+
def __init__(self, layout_prefix='doclay_page_', image_prefix='page_'):
1515
"""
1616
初始化处理器。
17-
18-
Args:
19-
layout_json_dir (str): 存储布局检测结果的JSON文件的目录。
20-
pdf_image_dir (str): 存储从PDF转换的原始页面图片的目录。
21-
output_image_dir (str): 用于保存裁剪出的图片的目录。
2217
"""
23-
self.layout_json = layout_json
24-
self.pdf_image_dir = pdf_image_dir
25-
self.output_image_dir = output_image_dir
2618
self.layout_prefix = layout_prefix # 用于处理布局JSON文件的前缀
2719
self.image_prefix = image_prefix # 用于处理PDF图片文件的前缀
2820

2921
self.image_counter = 0 # 用于生成唯一的图片文件名
3022
self.bbox_cache = {} # 缓存已加载的JSON数据,避免重复读取文件
3123
self.logger = get_logger()
3224

33-
# 确保输出目录存在
34-
os.makedirs(self.output_image_dir, exist_ok=True)
35-
self.logger.info(f"输出图片目录 '{self.output_image_dir}' 已准备就绪。")
36-
3725
def _get_bbox(self, page_num, figure_id):
3826
"""
3927
从对应的JSON文件中获取指定figure的边界框。
@@ -66,7 +54,7 @@ def _get_bbox(self, page_num, figure_id):
6654
# 在detections中查找figure_id
6755
i = -1
6856
for detection in layout_data:
69-
if detection.get("page_idx") == int(page_num) and detection.get("type") in ["text", "ref_text", "title", "equation", "list", "index", "image", "table", "code"]:
57+
if detection.get("page_idx") == int(page_num) and detection.get("type") in ["title", "equation", "list", "index", "image", "table", "code"]:
7058
i += 1
7159
# class_name 也可以是 'figure',id 可能是 'figure1', 'figure2' 等
7260
if i == int(figure_id):
@@ -163,20 +151,34 @@ def _dump_markdown(self, processed_qas, output_md_file):
163151
for idx, qa in enumerate(processed_qas, 1):
164152
question = qa.get('question', '').strip()
165153
answer = qa.get('answer', '').strip()
154+
solution = qa.get('solution', '').strip()
166155
f_md.write(f"## question {idx}\n")
167156
f_md.write(question + "\n\n")
168157
f_md.write(f"## answer {idx}\n")
169158
f_md.write(answer + "\n\n")
159+
f_md.write(f"## solution {idx}\n")
160+
f_md.write(solution + "\n\n")
170161
self.logger.info(f"Markdown 文件已保存到: {output_md_file}")
171162
except Exception as e:
172163
self.logger.error(f"写入 Markdown 文件失败: {output_md_file}, 错误: {e}")
173164

174-
def run(self, storage, input_qa_file, output_qa_file, output_md_file=None):
165+
def run(self, storage, input_layout_json, input_pdf_image_dir, output_image_dir, input_qa_file, output_qa_file, output_md_file=None):
175166
"""
167+
Args:
168+
input_layout_json (str): 存储布局检测结果的JSON文件的目录。
169+
input_pdf_image_dir (str): 存储从PDF转换的原始页面图片的目录。
170+
output_image_dir (str): 用于保存裁剪出的图片的目录。
176171
处理包含 QA 对的 JSON Lines 文件,并输出:
177172
1) 处理后的 JSON Lines 文件 (output_qa_file)
178173
2) 可选的 Markdown 文件 (output_md_file),按 ## question i ... ## answer i ... ## solution i ... 格式
179174
"""
175+
self.layout_json = input_layout_json
176+
self.pdf_image_dir = input_pdf_image_dir
177+
self.output_image_dir = output_image_dir
178+
# 确保输出目录存在
179+
os.makedirs(self.output_image_dir, exist_ok=True)
180+
self.logger.info(f"输出图片目录 '{self.output_image_dir}' 已准备就绪。")
181+
180182
processed_qas = []
181183
# —— 读取 & 处理 JSON Lines —— #
182184
try:
@@ -188,6 +190,8 @@ def run(self, storage, input_qa_file, output_qa_file, output_md_file=None):
188190
qa_item['question'] = self.process_text(qa_item['question'])
189191
if 'answer' in qa_item and isinstance(qa_item['answer'], str):
190192
qa_item['answer'] = self.process_text(qa_item['answer'])
193+
if 'solution' in qa_item and isinstance(qa_item['solution'], str):
194+
qa_item['solution'] = self.process_text(qa_item['solution'])
191195
processed_qas.append(qa_item)
192196
except json.JSONDecodeError:
193197
self.logger.error(f"跳过无效的JSON行: {line.strip()}")

0 commit comments

Comments
 (0)