HeRunming
diff --git a/‎dataflow/example/VQA/math_answer.pdf‎
60.6 KB b/‎dataflow/example/VQA/math_answer.pdf‎
60.6 KB
diff --git a/‎dataflow/example/VQA/math_question.pdf‎
73.5 KB b/‎dataflow/example/VQA/math_question.pdf‎
73.5 KB
diff --git a/‎dataflow/example/VQA/vqa_extract_interleaved_test.jsonl‎
Lines changed: 1 addition & 0 deletions b/‎dataflow/example/VQA/vqa_extract_interleaved_test.jsonl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dataflow/example/VQA/vqa_extract_long_distance_test.jsonl‎
Lines changed: 1 addition & 0 deletions b/‎dataflow/example/VQA/vqa_extract_long_distance_test.jsonl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dataflow/example/VQA/vqa_extract_test.jsonl‎
Lines changed: 0 additions & 1 deletion b/‎dataflow/example/VQA/vqa_extract_test.jsonl‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎dataflow/operators/vqa/generate/vqa_extract_doclayout.py‎
Lines changed: 11 additions & 11 deletions b/‎dataflow/operators/vqa/generate/vqa_extract_doclayout.py‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎dataflow/operators/vqa/generate/vqa_extract_pic_extractor.py‎
Lines changed: 5 additions & 7 deletions b/‎dataflow/operators/vqa/generate/vqa_extract_pic_extractor.py‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎dataflow/operators/vqa/generate/vqa_extract_qapair_extractor.py‎
Lines changed: 36 additions & 15 deletions b/‎dataflow/operators/vqa/generate/vqa_extract_qapair_extractor.py‎
Lines changed: 36 additions & 15 deletions
diff --git a/‎dataflow/operators/vqa/generate/vqa_extract_tag2img.py‎
Lines changed: 19 additions & 15 deletions b/‎dataflow/operators/vqa/generate/vqa_extract_tag2img.py‎
Lines changed: 19 additions & 15 deletions
@@ -0,0 +1 @@
+{"pdf_path": "../example_data/VQA/questionextract_test.pdf", "subject": "math", "output_dir": "../vqa_output_math"}
@@ -0,0 +1 @@
+{"question_pdf_path": "../example_data/VQA/math_question.pdf", "answer_pdf_path": "../example_data/VQA/math_answer.pdf", "subject": "math", "output_dir": "../vqa_output_math"}
@@ -43,18 +43,17 @@ def modified_draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_confi
             logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")
             rotation = 0
 
-        if rotation == 0:
+        if rotation == 90:
             c.translate(rect[0] + rect[2] + 2, rect[1] + rect[3] - 10)
-        elif rotation == 90:
+        elif rotation == 0:
             c.translate(rect[0] + 10, rect[1] + rect[3] + 2)
         elif rotation == 180:
             c.translate(rect[0] - 2, rect[1] + 10)
         elif rotation == 270:
             c.translate(rect[0] + rect[2] - 10, rect[1] - 2)
 
         c.rotate(rotation)
-        c.drawString(0, 0, f"tag{i}:")
-        c.drawString(0, -10, f"box{j}")
+        c.drawString(0, 0, f"tag{i}:box{j}")
         c.restoreState()
 
     return c
@@ -147,8 +146,8 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         imgs_body_list.append(imgs_body)
         # imgs_caption_list.append(imgs_caption)
         # imgs_footnote_list.append(imgs_footnote)
-        titles_list.append(titles)
-        texts_list.append(texts)
+        # titles_list.append(titles)
+        # texts_list.append(texts)
         interequations_list.append(interequations)
         lists_list.append(lists)
         list_items_list.append(list_items)
@@ -162,9 +161,9 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         page_block_list = []
         for block in page["para_blocks"]:
             if block["type"] in [
-                BlockType.TEXT,
-                BlockType.REF_TEXT,
-                BlockType.TITLE,
+                # BlockType.TEXT,
+                # BlockType.REF_TEXT,
+                # BlockType.TITLE,
                 BlockType.INTERLINE_EQUATION,
                 BlockType.LIST,
                 BlockType.INDEX,
@@ -199,8 +198,8 @@ def modified_draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         c = modified_draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
         # c = modified_draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
         # c = modified_draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
-        c = modified_draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
-        c = modified_draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
+        # c = modified_draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
+        # c = modified_draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
         c = modified_draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
         c = modified_draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
         c = modified_draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
@@ -236,6 +235,7 @@ def __init__(self, mineru_backend: Literal["vlm-transformers","vlm-vllm-engine"]
 
     def run(self, storage, input_pdf_file_path:str,
                         output_folder:str):
+        global cal_canvas_rect, BlockType, SplitFlag, PdfReader, PdfWriter, PageObject, canvas
         try:
             import mineru
             from mineru.utils.draw_bbox import cal_canvas_rect
 
@@ -15,14 +15,12 @@
 class VQAExtractPicExtractor(OperatorABC):
     def __init__(self,
                 llm_serving: LLMServingABC = None,
-                model: str = "o4-mini",
-                subject: str = "math"
+                interleaved: bool = True
                 ):
         self.logger = get_logger()
         self.llm_serving = llm_serving
         self.prompt = VQAExtractPrompt()
-        self.model = model
-        self.subject = subject
+        self.interleaved = interleaved
 
     def _format_instructions(self, image_files: List[str]):
         list_of_image_paths = []
@@ -34,7 +32,7 @@ def _format_instructions(self, image_files: List[str]):
         return list_of_image_paths, list_of_image_labels
 
 
-    def run(self, storage, input_layout_path: str, output_folder: str):
+    def run(self, storage, input_layout_path: str, input_subject: str, output_folder: str):
         # 从layout_path/images中读取所有图片的文件名,确保为绝对路径
         image_files = [os.path.join(input_layout_path, image_file) for image_file in os.listdir(input_layout_path)]
         # 确保end with jpg & png
@@ -46,9 +44,9 @@ def filename2idx(filename: str):
         image_files.sort(key=filename2idx)
 
         list_of_image_paths, list_of_image_labels = self._format_instructions(image_files)
-        system_prompt = self.prompt.build_prompt(self.subject)
+        system_prompt = self.prompt.build_prompt(input_subject, interleaved=self.interleaved)
 
-        responses = self.llm_serving.generate_from_input_multi_images(list_of_image_paths, list_of_image_labels, system_prompt, self.model)
+        responses = self.llm_serving.generate_from_input_multi_images(list_of_image_paths, list_of_image_labels, system_prompt)
 
         # 将list of image paths和list of image labels和responses作为三列组织为jsonl
         list_of_dict = []
 
@@ -12,24 +12,45 @@ def __init__(self):
     def extract_qa_pairs_from_text(self, page: int, text: str):
         """
         从一段 response 文本中提取所有 <qa_pair>…</qa_pair>
+        格式为：<chapter><title>MAIN_TITLE</title>
+        <qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
+        <answer>ANSWER_TEXT<pic>…</pic>…</answer></qa_pair>
+        <qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
+        <answer>ANSWER_TEXT<pic>…</pic>…</answer></qa_pair>
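+        </chapter>
+        其中 <qa_pair> 内还可以包含可选的 <solution>…</solution>；缺失时 solution 记为空字符串。
         并返回 [{'page': ..., 'question': ..., 'answer': ..., 'solution': ..., 'label': ..., 'chapter_title': ...}, …]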
         """
         qa_list = []
-        # 找出所有 qa_pair 块
-        for pair in re.findall(r'<qa_pair>(.*?)</qa_pair>', text, flags=re.DOTALL):
-            # 提取 question 部分
-            q_match = re.search(r'<question>(.*?)</question>', pair, flags=re.DOTALL)
-            # 提取 answer 部分
-            a_match = re.search(r'<answer>(.*?)</answer>', pair, flags=re.DOTALL)
-            if not (q_match and a_match):
-                continue
-            question = q_match.group(1).strip()
-            answer = a_match.group(1).strip()
-            qa_list.append({
-                'page': page,
-                'question': question,
-                'answer': answer
-            })
+        # 提取title
+        for chapter_block in re.findall(r'<chapter>(.*?)</chapter>', text, flags=re.DOTALL):
+            title = re.search(r'<title>(.*?)</title>', chapter_block, flags=re.DOTALL)
+            if title:
+                chapter_title = title.group(1).strip()
+            else:
+                chapter_title = ""
+            # 找出所有 qa_pair 块
+            for pair in re.findall(r'<qa_pair>(.*?)</qa_pair>', chapter_block, flags=re.DOTALL):
+                # 提取 question 部分
+                q_match = re.search(r'<question>(.*?)</question>', pair, flags=re.DOTALL)
+                # 提取 answer 部分
+                a_match = re.search(r'<answer>(.*?)</answer>', pair, flags=re.DOTALL)
+                # 提取solution部分
+                s_match = re.search(r'<solution>(.*?)</solution>', pair, flags=re.DOTALL)
+                # 提取label
+                label_match = re.search(r'<label>(.*?)</label>', pair, flags=re.DOTALL)
+                if not (q_match and a_match and label_match):
+                    continue
+                question = q_match.group(1).strip()
+                answer = a_match.group(1).strip()
+                label = label_match.group(1).strip()
+                qa_list.append({
+                    'page': page,
+                    'question': question,
+                    'answer': answer,
+                    'solution': s_match.group(1).strip() if s_match else "",
+                    'label': label,
+                    'chapter_title': chapter_title
+                })
         return qa_list
 
     def run(self, storage, input_vqa_extract_path: str, output_qa_path: str):
 
@@ -11,29 +11,17 @@
 
 @OPERATOR_REGISTRY.register()
 class VQAExtractTag2Img(OperatorABC):
-    def __init__(self, layout_json, pdf_image_dir, output_image_dir, layout_prefix='doclay_page_', image_prefix='page_'):
+    def __init__(self, layout_prefix='doclay_page_', image_prefix='page_'):
         """
         初始化处理器。
-
-        Args:
-            layout_json_dir (str): 存储布局检测结果的JSON文件的目录。
-            pdf_image_dir (str): 存储从PDF转换的原始页面图片的目录。
-            output_image_dir (str): 用于保存裁剪出的图片的目录。
         """
-        self.layout_json = layout_json
-        self.pdf_image_dir = pdf_image_dir
-        self.output_image_dir = output_image_dir
         self.layout_prefix = layout_prefix  # 用于处理布局JSON文件的前缀
         self.image_prefix = image_prefix    # 用于处理PDF图片文件的前缀
 
         self.image_counter = 0  # 用于生成唯一的图片文件名
         self.bbox_cache = {}    # 缓存已加载的JSON数据，避免重复读取文件
         self.logger = get_logger()
 
-        # 确保输出目录存在
-        os.makedirs(self.output_image_dir, exist_ok=True)
-        self.logger.info(f"输出图片目录 '{self.output_image_dir}' 已准备就绪。")
-
     def _get_bbox(self, page_num, figure_id):
         """
         从对应的JSON文件中获取指定figure的边界框。
@@ -66,7 +54,7 @@ def _get_bbox(self, page_num, figure_id):
             # 在detections中查找figure_id
             i = -1
             for detection in layout_data:
-                if detection.get("page_idx") == int(page_num) and detection.get("type") in ["text", "ref_text", "title", "equation", "list", "index", "image", "table", "code"]:
+                if detection.get("page_idx") == int(page_num) and detection.get("type") in ["title", "equation", "list", "index", "image", "table", "code"]:
                     i += 1
                 # class_name 也可以是 'figure'，id 可能是 'figure1', 'figure2' 等
                     if i == int(figure_id):
@@ -163,20 +151,34 @@ def _dump_markdown(self, processed_qas, output_md_file):
                 for idx, qa in enumerate(processed_qas, 1):
                     question = qa.get('question', '').strip()
                     answer = qa.get('answer', '').strip()
+                    solution = qa.get('solution', '').strip()
                     f_md.write(f"## question {idx}\n")
                     f_md.write(question + "\n\n")
                     f_md.write(f"## answer {idx}\n")
                     f_md.write(answer + "\n\n")
+                    f_md.write(f"## solution {idx}\n")
+                    f_md.write(solution + "\n\n")
             self.logger.info(f"Markdown 文件已保存到: {output_md_file}")
         except Exception as e:
             self.logger.error(f"写入 Markdown 文件失败: {output_md_file}, 错误: {e}")
 
-    def run(self, storage, input_qa_file, output_qa_file, output_md_file=None):
+    def run(self, storage, input_layout_json, input_pdf_image_dir, output_image_dir, input_qa_file, output_qa_file, output_md_file=None):
         """
+        Args:
+            input_layout_json (str): 存储布局检测结果的JSON文件的目录。
+            input_pdf_image_dir (str): 存储从PDF转换的原始页面图片的目录。
+            output_image_dir (str): 用于保存裁剪出的图片的目录。
         处理包含 QA 对的 JSON Lines 文件，并输出：
         1) 处理后的 JSON Lines 文件 (output_qa_file)
         2) 可选的 Markdown 文件 (output_md_file)，按 ## question i ... ## answer i ... ## solution i ... 格式
         """
+        self.layout_json = input_layout_json
+        self.pdf_image_dir = input_pdf_image_dir
+        self.output_image_dir = output_image_dir
+        # 确保输出目录存在
+        os.makedirs(self.output_image_dir, exist_ok=True)
+        self.logger.info(f"输出图片目录 '{self.output_image_dir}' 已准备就绪。")
+        
         processed_qas = []
         # —— 读取 & 处理 JSON Lines —— #
         try:
@@ -188,6 +190,8 @@ def run(self, storage, input_qa_file, output_qa_file, output_md_file=None):
                             qa_item['question'] = self.process_text(qa_item['question'])
                         if 'answer' in qa_item and isinstance(qa_item['answer'], str):
                             qa_item['answer'] = self.process_text(qa_item['answer'])
+                        if 'solution' in qa_item and isinstance(qa_item['solution'], str):
+                            qa_item['solution'] = self.process_text(qa_item['solution'])
                         processed_qas.append(qa_item)
                     except json.JSONDecodeError:
                         self.logger.error(f"跳过无效的JSON行: {line.strip()}")
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"pdf_path": "../example_data/VQA/questionextract_test.pdf", "subject": "math", "output_dir": "../vqa_output_math"}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"question_pdf_path": "../example_data/VQA/math_question.pdf", "answer_pdf_path": "../example_data/VQA/math_answer.pdf", "subject": "math", "output_dir": "../vqa_output_math"}`