HeRunming
diff --git a/‎dataflow/cli_funcs/cli_pdf.py‎
Lines changed: 59 additions & 11 deletions b/‎dataflow/cli_funcs/cli_pdf.py‎
Lines changed: 59 additions & 11 deletions
diff --git a/‎dataflow/core/prompt.py‎
Lines changed: 48 additions & 8 deletions b/‎dataflow/core/prompt.py‎
Lines changed: 48 additions & 8 deletions
diff --git a/‎dataflow/example/core_text_data/double_column_input.json‎
Lines changed: 18 additions & 0 deletions b/‎dataflow/example/core_text_data/double_column_input.json‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎dataflow/operators/chemistry/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎dataflow/operators/chemistry/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎…hemistry/eval/eval_smiles_equivalence.py‎ ‎…/smiles_equivalence_dataset_evaluator.py‎dataflow/operators/chemistry/eval/eval_smiles_equivalence.py renamed to dataflow/operators/chemistry/eval/smiles_equivalence_dataset_evaluator.py
Lines changed: 1 addition & 1 deletion b/‎…hemistry/eval/eval_smiles_equivalence.py‎ ‎…/smiles_equivalence_dataset_evaluator.py‎dataflow/operators/chemistry/eval/eval_smiles_equivalence.py renamed to dataflow/operators/chemistry/eval/smiles_equivalence_dataset_evaluator.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎…try/generate/extract_smiles_from_text.py‎ ‎…te/extract_smiles_from_text_generator.py‎dataflow/operators/chemistry/generate/extract_smiles_from_text.py renamed to dataflow/operators/chemistry/generate/extract_smiles_from_text_generator.py
Lines changed: 9 additions & 2 deletions b/‎…try/generate/extract_smiles_from_text.py‎ ‎…te/extract_smiles_from_text_generator.py‎dataflow/operators/chemistry/generate/extract_smiles_from_text.py renamed to dataflow/operators/chemistry/generate/extract_smiles_from_text_generator.py
Lines changed: 9 additions & 2 deletions
diff --git a/‎dataflow/operators/code/eval/code_quality_sample_evaluator.py‎
Lines changed: 7 additions & 1 deletion b/‎dataflow/operators/code/eval/code_quality_sample_evaluator.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎dataflow/operators/code/generate/code_code_to_instruction_generator.py‎
Lines changed: 7 additions & 1 deletion b/‎dataflow/operators/code/generate/code_code_to_instruction_generator.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎dataflow/operators/code/generate/code_gen_instruction.py‎
Lines changed: 9 additions & 3 deletions b/‎dataflow/operators/code/generate/code_gen_instruction.py‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎dataflow/operators/code/generate/code_instruction_enhancement.py‎
Lines changed: 7 additions & 1 deletion b/‎dataflow/operators/code/generate/code_instruction_enhancement.py‎
Lines changed: 7 additions & 1 deletion
@@ -199,7 +199,6 @@ def check_required_files():
     # 检查所有需要的内置脚本
     required_scripts = [
         "path_to_jsonl_script.py",
-        "merge_filter_qa_pairs.py",
         "llama_factory_trainer.py"
     ]
 
@@ -321,31 +320,82 @@ def cli_pdf2model_train(lf_yaml: str = ".cache/train_config.yaml", cache_path: s
     print("-" * 60)
 
     try:
-        # Step 1: PDF Detection - 使用内置脚本
+        # Step 1: PDF Detection
         script1_path = get_dataflow_script_path("path_to_jsonl_script.py")
         args1 = ["./", "--output", str(cache_path_obj / ".cache" / "gpu" / "pdf_list.jsonl")]
         if not run_script_with_args(script1_path, "Step 1: PDF Detection", args1, cwd=str(current_dir)):
             return False
 
-        # Step 2: Data Processing - 使用用户目录下的脚本
+        # Step 2: Data Processing
         script2 = current_dir / "pdf_to_qa_pipeline.py"
         args2 = ["--cache", cache_path]
         if not run_script_with_args(script2, "Step 2: Data Processing", args2, cwd=str(current_dir)):
             return False
 
-        # Step 3: Data Conversion - 使用内置脚本
-        script3_path = get_dataflow_script_path("merge_filter_qa_pairs.py")
-        args3 = ["--cache", cache_path]
-        if not run_script_with_args(script3_path, "Step 3: Data Conversion", args3, cwd=str(current_dir)):
+        # Step 2.5: Create dataset_info.json (dynamically)
+        print(f"\n{Fore.BLUE}Step 2.5: Creating dataset_info.json{Style.RESET_ALL}")
+
+        # 读取训练配置，获取数据集名称
+        try:
+            with open(config_path_obj, 'r', encoding='utf-8') as f:
+                train_config = yaml.safe_load(f)
+            
+            # 获取数据集名称
+            dataset_name = train_config.get('dataset')
+            if isinstance(dataset_name, list):
+                dataset_name = dataset_name[0]  # 如果是列表，取第一个
+            
+            if not dataset_name:
+                print("Warning: No dataset name found in train_config.yaml, using default 'kb_qa'")
+                dataset_name = 'kb_qa'
+            
+            print(f"Dataset name from config: {dataset_name}")
+            
+        except Exception as e:
+            print(f"Warning: Could not read train_config.yaml: {e}")
+            print("Using default dataset name: kb_qa")
+            dataset_name = 'kb_qa'
+
+        # 创建 dataset_info.json
+        dataset_info_path = cache_path_obj / ".cache" / "data" / "dataset_info.json"
+        dataset_info_path.parent.mkdir(parents=True, exist_ok=True)
+
+        dataset_info = {
+            dataset_name: {  # ← 使用从配置读取的名称
+                "file_name": "qa.json",
+                "formatting": "alpaca",
+                "columns": {
+                    "prompt": "instruction",
+                    "query": "input",
+                    "response": "output"
+                }
+            }
+        }
+
+        with open(dataset_info_path, 'w', encoding='utf-8') as f:
+            json.dump(dataset_info, f, indent=2, ensure_ascii=False)
+
+        print(f"Created: {dataset_info_path}")
+        print(f"Dataset registered as: {dataset_name}")
+        print(f"{Fore.GREEN}✅ Step 2.5: Creating dataset_info.json completed{Style.RESET_ALL}")
+
+        # Step 3: Data Conversion - skip
+        print(f"\n{Fore.BLUE}Step 3: Data Conversion{Style.RESET_ALL}")
+        qa_json_path = cache_path_obj / ".cache" / "data" / "qa.json"
+        if qa_json_path.exists():
+            print(f"✅ qa.json already in correct format, skipping conversion")
+            print(f"{Fore.GREEN}✅ Step 3: Data Conversion completed{Style.RESET_ALL}")
+        else:
+            print(f"❌ qa.json not found at {qa_json_path}")
             return False
 
-        # Step 4: Training - 使用内置脚本
+        # Step 4: Training
         script4_path = get_dataflow_script_path("llama_factory_trainer.py")
         args4 = ["--config", str(config_path_obj), "--cache", cache_path]
         if not run_script_with_args(script4_path, "Step 4: Training", args4, cwd=str(current_dir)):
             return False
 
-        # 显示训练完成信息，从配置文件中读取实际的输出目录
+        # Show completion info
         try:
             with open(config_path_obj, 'r', encoding='utf-8') as f:
                 config = yaml.safe_load(f)
@@ -367,8 +417,6 @@ def cli_pdf2model_train(lf_yaml: str = ".cache/train_config.yaml", cache_path: s
 
 def cli_pdf2model_chat(model_path=None, cache_path="./", base_model=None):
     """Start LlamaFactory chat interface"""
-    print("Starting chat interface...")
-
     current_dir = Path(os.getcwd())
 
     # 处理cache路径
 
@@ -1,5 +1,6 @@
 from typing import TypeVar, Protocol, Union, get_type_hints,cast
 from functools import wraps
+import inspect
 # from dataflow.core import OperatorABC
 
 class PromptABC():
@@ -34,22 +35,33 @@ def decorator(cls:T) -> T:
         # self.ALLOWED_PROMPTS = list(allowed_prompts)
 
         orig_init = cls.__init__
+        sig = inspect.signature(orig_init)  # 在装饰时就解析一次签名，避免每次实例化重复解析
+        if "prompt_template" not in sig.parameters:
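+            # If the class's __init__ has no `prompt_template` parameter, keep only the annotation/attribute setup and skip the runtime check.
+            # (Alternatively, raise here to force decorated classes to declare the parameter.)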
+            pass
 
         @wraps(orig_init)
         def new_init(self, *args, **kwargs):
-            pt = kwargs.get("prompt_template", None)
-            # if pt is None and len(args) > 1:
-            #     pt = args[1]
+            # 用签名绑定实参：自动把位置/关键字/默认值对齐到参数名
+            try:
+                bound = sig.bind_partial(self, *args, **kwargs)
+                bound.apply_defaults()
+            except TypeError:
+                # 参数不完整或不匹配时，交给原始 __init__ 去报错更合适
+                return orig_init(self, *args, **kwargs)
+
+            pt = bound.arguments.get("prompt_template", None)
 
             if pt is not None and not isinstance(pt, cls.ALLOWED_PROMPTS):
                 if not isinstance(pt, DIYPromptABC):
-                    # 每个类的完整 import 路径，换行分隔
                     allowed_names = "\n".join(
                         f"  - {c.__module__}.{c.__qualname__}"
                         for c in cls.ALLOWED_PROMPTS
                     )
                     raise TypeError(
-                        f"[{cls.__name__}] Invalid prompt_template type: {type(pt).__module__}.{type(pt).__qualname__}\n"
+                        f"[{cls.__name__}] Invalid prompt_template type: "
+                        f"{type(pt).__module__}.{type(pt).__qualname__}\n"
                         f"Expected one of:\n{allowed_names}\n"
                         f"or a custom subclass of `dataflow.core.prompt.DIYPromptABC.`"
                     )
@@ -58,10 +70,38 @@ def new_init(self, *args, **kwargs):
 
         cls.__init__ = new_init
 
-        # 更新类型注解（运行时可见，get_type_hints 可解析）
+        # Update the type annotation (visible at runtime, resolvable via get_type_hints)
         cls.__annotations__ = dict(getattr(cls, "__annotations__", {}))
         cls.__annotations__["prompt_template"] = _make_diyprompt_union(allowed_prompts)
 
-        # return cast(T, cast(OperatorWithAllowedPrompts, cls))
         return cls
-    return decorator
+    return decorator
+
+
+if __name__ == "__main__":  # Ad-hoc self-check of prompt_restrict; runs only when this module is executed as a script
+    import pytest  # used only for pytest.raises in the negative cases below
+
+    class A(PromptABC): pass
+    class B(PromptABC): pass
+    class MyDIY(DIYPromptABC): pass
+    class Other(PromptABC): pass
+
+    @prompt_restrict(A, B)
+    class Op:
+        def __init__(self, prompt_template=None):
+            self.prompt_template = prompt_template
+
+    # Keyword argument: allowed
+    Op(prompt_template=A())
+    Op(prompt_template=B())
+    Op(prompt_template=MyDIY())
+    Op()  # None is allowed
+
+    # Positional argument: checked as well
+    Op(A())        # ✅
+    Op(MyDIY())    # ✅
+    with pytest.raises(TypeError):
+        Op(Other())  # ❌ not whitelisted and not a DIY prompt
+
+    with pytest.raises(TypeError):
+        Op(object())  # ❌ completely unrelated type
@@ -0,0 +1,18 @@
+[
+    {
+        "roll": "pig",
+        "term": "eat"
+    },
+    {
+        "roll": "tiger",
+        "term": "chase"
+    },
+    {
+        "roll": "people",
+        "term": "drink"
+    },
+    {
+        "roll": "bird",
+        "term": "dance"
+    }
+]
@@ -1,8 +1,8 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from generate.extract_smiles_from_text import ExtractSmilesFromText
-    from eval.eval_smiles_equivalence import EvaluateSmilesEquivalence
+    from generate.extract_smiles_from_text_generator import ExtractSmilesFromTextGenerator
+    from eval.smiles_equivalence_dataset_evaluator import SmilesEquivalenceDatasetEvaluator
 else:
     import sys
     from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
 
@@ -8,7 +8,7 @@
 import json
 
 @OPERATOR_REGISTRY.register()
-class EvaluateSmilesEquivalence(OperatorABC):
+class SmilesEquivalenceDatasetEvaluator(OperatorABC):
     """
     对每个块（row）里的 golden_label 与 synth_smiles 进行 SMILES 等价性评估：
     - 以 abbreviation 对齐
 
@@ -10,13 +10,20 @@
 import json
 import re
 
+from typing import Union
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
+from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
 
+
+@prompt_restrict(
+    ExtractSmilesFromTextPrompt
+)
 @OPERATOR_REGISTRY.register()
-class ExtractSmilesFromText(OperatorABC):
+class ExtractSmilesFromTextGenerator(OperatorABC):
     '''
     Answer Generator is a class that generates answers for given questions.
     '''
-    def __init__(self, llm_serving: LLMServingABC, prompt_template = None):
+    def __init__(self, llm_serving: LLMServingABC, prompt_template: Union[ExtractSmilesFromTextPrompt, DIYPromptABC] = ExtractSmilesFromTextPrompt):
         self.logger = get_logger()
         self.llm_serving = llm_serving
         self.prompt_template = prompt_template
 
@@ -8,8 +8,14 @@
 from dataflow.utils.storage import DataFlowStorage
 from dataflow.core import OperatorABC
 from dataflow.core import LLMServingABC
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
 from dataflow.prompts.code import CodeQualityEvaluatorPrompt, DiyCodePrompt
 
+from typing import Union
+@prompt_restrict(
+    CodeQualityEvaluatorPrompt,
+    DiyCodePrompt
+)
 @OPERATOR_REGISTRY.register()
 class CodeQualitySampleEvaluator(OperatorABC):
     """
@@ -18,7 +24,7 @@ class CodeQualitySampleEvaluator(OperatorABC):
     and textual feedback, acting as an automated code reviewer.
     """
 
-    def __init__(self, llm_serving: LLMServingABC, prompt_template=None):
+    def __init__(self, llm_serving: LLMServingABC, prompt_template: Union[CodeQualityEvaluatorPrompt, DiyCodePrompt, DIYPromptABC] = None):
         """
         Initializes the operator with a language model serving endpoint.
         """
 
@@ -9,6 +9,12 @@
 from dataflow.core import LLMServingABC
 from dataflow.prompts.code import CodeCodeToInstructionGeneratorPrompt, DiyCodePrompt
 
+from typing import Union
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
+@prompt_restrict(
+    CodeCodeToInstructionGeneratorPrompt,
+    DiyCodePrompt
+)
 @OPERATOR_REGISTRY.register()
 class CodeCodeToInstructionGenerator(OperatorABC):
     """
@@ -17,7 +23,7 @@ class CodeCodeToInstructionGenerator(OperatorABC):
     'self-instruct' style data synthesis pipeline for code.
     """
 
-    def __init__(self, llm_serving: LLMServingABC, prompt_template=None):
+    def __init__(self, llm_serving: LLMServingABC, prompt_template: Union[CodeCodeToInstructionGeneratorPrompt, DiyCodePrompt, DIYPromptABC] = None):
         """
         Initializes the operator with a language model serving endpoint.
         """
 
@@ -7,8 +7,14 @@
 from dataflow.utils.storage import DataFlowStorage
 from dataflow.core import OperatorABC 
 from dataflow.core import LLMServingABC
-from dataflow.prompts.code import CodeInstructionGenerate, DiyCodePrompt
+from dataflow.prompts.code import CodeInstructionGeneratePrompt, DiyCodePrompt
 
+from typing import Union
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC 
+
+@prompt_restrict(
+    CodeInstructionGeneratePrompt,
+)
 @OPERATOR_REGISTRY.register()
 class CodeInstructionGenerator(OperatorABC):
     """
@@ -19,7 +25,7 @@ class CodeInstructionGenerator(OperatorABC):
     and enhance instruction datasets for programming tasks.
     """
 
-    def __init__(self, llm_serving: LLMServingABC, prompt_template=None, num_few_shot: int = 3, num_generate: int = 10):
+    def __init__(self, llm_serving: LLMServingABC, prompt_template: Union[CodeInstructionGeneratePrompt, DIYPromptABC]=None, num_few_shot: int = 3, num_generate: int = 10):
         """
         Initializes the operator with a language model serving endpoint.
         
@@ -32,7 +38,7 @@ def __init__(self, llm_serving: LLMServingABC, prompt_template=None, num_few_sho
         self.num_generate = num_generate
         self.llm_serving = llm_serving
         self.num_few_shot = num_few_shot
-        self.prompt_template = CodeInstructionGenerate()
+        self.prompt_template = CodeInstructionGeneratePrompt()
 
     @staticmethod
     def get_desc(lang: str = "en"):
 
@@ -8,6 +8,12 @@
 from dataflow.core import LLMServingABC
 from dataflow.prompts.code import CodeInstructionEnhancement, DiyCodePrompt
 
+from typing import Union
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
+@prompt_restrict(
+    CodeInstructionEnhancement,
+    DiyCodePrompt
+)
 @OPERATOR_REGISTRY.register()
 class CodeEnhancementInstructionGenerator(OperatorABC):
     """
@@ -16,7 +22,7 @@ class CodeEnhancementInstructionGenerator(OperatorABC):
     It rewrites original instructions into standardized English instruction + code block format.
     """
 
-    def __init__(self, llm_serving: LLMServingABC, prompt_template=None):
+    def __init__(self, llm_serving: LLMServingABC, prompt_template: Union[CodeInstructionEnhancement, DiyCodePrompt, DIYPromptABC] = None):
         """
         Initializes the operator with a language model serving endpoint.
         """