From 858a36390e4c20df2f518e8c2e38f84a2218b493 Mon Sep 17 00:00:00 2001 From: asd765973346 Date: Sat, 14 Feb 2026 23:34:28 +0800 Subject: [PATCH] add new webagent page --- docs/.vuepress/notes/en/guide.ts | 3 +- docs/.vuepress/notes/zh/guide.ts | 3 +- docs/en/notes/guide/agent/web_collection.md | 372 ++++++++++++++++++++ docs/zh/notes/guide/agent/web_collection.md | 372 ++++++++++++++++++++ 4 files changed, 748 insertions(+), 2 deletions(-) create mode 100644 docs/en/notes/guide/agent/web_collection.md create mode 100644 docs/zh/notes/guide/agent/web_collection.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 28fd604cc..ea87e71f5 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -121,7 +121,8 @@ export const Guide: ThemeNote = defineNoteConfig({ "operator_qa", "operator_write", "pipeline_prompt", - "pipeline_rec&refine" + "pipeline_rec&refine", + "web_collection" ] }, ], diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index ec8307e9a..852f2377e 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -120,7 +120,8 @@ export const Guide: ThemeNote = defineNoteConfig({ "operator_qa", "operator_write", "pipeline_prompt", - "pipeline_rec&refine" + "pipeline_rec&refine", + "web_collection" ] }, // { diff --git a/docs/en/notes/guide/agent/web_collection.md b/docs/en/notes/guide/agent/web_collection.md new file mode 100644 index 000000000..d0e17e66d --- /dev/null +++ b/docs/en/notes/guide/agent/web_collection.md @@ -0,0 +1,372 @@ +--- +title: Web Data Collection +createTime: 2026/02/14 00:00:00 +permalink: /en/guide/agent/web_collection/ +--- + +## 1. Overview + +**Web Collection Agent** is the intelligent data collection module in DataFlow-Agent, designed to automatically collect, process, and format training datasets from the internet. The system supports two data types: + +- **PT (Pre-Training)**: Large-scale unlabeled corpora for model pre-training. +- **SFT (Supervised Fine-Tuning)**: Structured instruction-response pairs for model fine-tuning. + +The workflow is capable of: + +1. **Web Search & Exploration**: Multi-layer BFS forest exploration strategy with LLM-driven URL filtering to automatically discover and locate target datasets. +2. **Multi-Platform Download**: Supports HuggingFace, Kaggle, and direct web download, with LLM intelligently deciding the download priority order. +3. **Dual-Channel Parallel Collection**: WebSearch and WebCrawler pipelines run in parallel, providing richer data sources. +4. **Adaptive Data Mapping**: LLM generates Python mapping functions with a triple-verification mechanism to automatically convert heterogeneous data into standard Alpaca format. + +## 2. System Architecture + +This function is orchestrated by `dataflow_agent/workflow/wf_web_collection.py`, forming a directed graph with parallel branches and conditional loops. The overall process is divided into four phases: task analysis, data collection (parallel), data download, and data processing & mapping. + +### 2.1 Task Analysis Phase + +1. **Start Node** + 1. **Responsibility**: Initializes the workflow configuration, creates the download directory, and prepares the execution environment. + 2. **Input**: `state.request.target` (user's original requirement). + 3. **Output**: Initialized `user_query` and download directory. + +2. **Task Decomposer** + 1. 
**Responsibility**: Uses LLM to decompose complex user requirements into executable subtasks, with a maximum task limit (default 5). + 2. **Input**: User's original query. + 3. **LLM Thinking**: Analyzes the semantic meaning of the requirement and splits it into independent data collection subtasks. + 4. **Output**: `state.task_list`, for example: + - Subtask 1: Collect NLP Q&A datasets + - Subtask 2: Collect text classification datasets + - Subtask 3: Collect image classification datasets + +3. **Category Classifier** + 1. **Responsibility**: Determines whether the current task belongs to the PT or SFT type. + 2. **Input**: Current subtask name. + 3. **LLM Thinking**: Determines the data category based on the task description and generates a dataset background description. + 4. **Output**: `state.category` (`"PT"` or `"SFT"`) and `dataset_background`. + 5. **Fallback Mechanism**: When LLM cannot determine the category, keyword matching is used. SFT keywords include: `["sft", "fine-tuning", "qa", "instruction", "chat", "dialogue"]`. + +### 2.2 Data Collection Phase (Parallel Execution) + +After task analysis is complete, the system enters the `parallel_collection` parallel branch, simultaneously launching two collection pipelines: WebSearch and WebCrawler. + +#### 2.2.1 WebSearch Node + +WebSearch Node is the core data collection node of the system, implementing a complete web exploration and information extraction pipeline with the following core components: + +1. **QueryGenerator** + - **Responsibility**: Generates 3-5 diversified search queries based on the user's original requirement. + - **Example**: Input `"Collect Python code generation datasets"`, output: + - `"Python code generation dataset download"` + - `"Python programming instruction dataset HuggingFace"` + - `"code completion training data GitHub"` + +2. **WebTools** + - **search_web()**: Calls search engines (Tavily / DuckDuckGo / Jina) to obtain the initial URL list. + - **read_with_jina_reader()**: Uses Jina Reader to crawl web page content and return structured Markdown-formatted text. + +3. **Multi-Layer BFS Forest Exploration** + - **Algorithm**: Adopts a Breadth-First Search (BFS) strategy to explore web links layer by layer. In each layer, Jina Reader is used to crawl page content, extract candidate URLs, and then URLSelector filters the most relevant links for the next layer. + - **Key Parameters**: + - `max_depth`: Maximum exploration depth (default 2) + - `concurrent_limit`: Number of concurrent requests (default 10) + - `topk_urls`: Number of URLs filtered per layer (default 5) + - `url_timeout`: Request timeout (default 60 seconds) + +4. **URLSelector** + - **Responsibility**: Uses LLM to select the most relevant URLs from the candidate URL list based on the research objective. + - **Filtering Strategy**: Analyzes URL relevance to the research objective, domain credibility, avoids duplicate content, and filters blocked domains. + +5. **RAGManager** + - **Responsibility**: Stores crawled web content into a vector database, supporting subsequent semantic retrieval and providing context for the SummaryAgent. + +6. **SummaryAgent** + - **Responsibility**: Generates specific download subtasks based on RAG-retrieved content. 
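+   - **Sketch**: a minimal, hypothetical illustration of this step; the `rag.retrieve` and `llm.chat` calls are assumed interfaces used for readability, not the module's actual API. It produces subtasks in the shape shown under **Output** below:
+     ```python
+     # Hypothetical sketch only; real class and method names in dataflow_agent may differ.
+     import json
+     from typing import Dict, List
+
+     async def summarize_to_subtasks(rag, llm, objective: str, top_k: int = 5) -> List[Dict]:
+         """Turn RAG-retrieved page content into concrete download subtasks."""
+         # 1. Pull the most relevant crawled chunks for the research objective (assumed interface).
+         chunks = await rag.retrieve(objective, top_k=top_k)
+         context = "\n\n".join(chunk["text"] for chunk in chunks)
+
+         # 2. Ask the LLM to propose download subtasks as JSON (assumed interface).
+         prompt = (
+             f"Research objective: {objective}\n"
+             f"Context from crawled pages:\n{context}\n\n"
+             "Return a JSON list of download subtasks with the fields "
+             "type, objective, search_keywords, platform_hint, priority."
+         )
+         raw = await llm.chat(prompt)
+
+         # 3. Parse the reply, falling back to an empty list if it is not valid JSON.
+         try:
+             return json.loads(raw)
+         except json.JSONDecodeError:
+             return []
+     ```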
+ - **Output**: A structured subtask list, for example: + ```json + { + "type": "download", + "objective": "Download Spider Text2SQL dataset", + "search_keywords": ["spider dataset", "text2sql"], + "platform_hint": "huggingface", + "priority": 1 + } + ``` + +#### 2.2.2 WebCrawler Node + +WebCrawler Node specializes in extracting code blocks and technical content from web pages. It runs in parallel with WebSearch Node, providing richer data sources. + +1. **Generate Search Queries**: Creates specialized search queries targeting code/technical content. +2. **Search & Crawl**: Searches the web for URL lists and uses Jina Reader for concurrent page crawling. +3. **Code Block Extraction**: Calls `extract_code_blocks_from_markdown` to extract code blocks from Markdown content. +4. **Save Results**: Stores crawled results as `webcrawler_crawled.jsonl`. + +### 2.3 Data Download Phase + +**Download Node** performs the actual dataset download tasks, supporting three download methods with LLM intelligently deciding the download priority order. + +1. **DownloadMethodDecisionAgent (LLM Decision)** + - **Responsibility**: Analyzes the best download method based on the task objective and outputs a priority list, e.g., `["huggingface", "kaggle", "web"]`. + +2. **Try Each Download Method Sequentially**: + - **HuggingFace**: Searches HuggingFace Hub, LLM selects the best matching dataset, and downloads via API. + - **Kaggle**: Searches Kaggle datasets, LLM selects the best match, and downloads through the Kaggle API. + - **Web**: Uses WebAgent for intelligent web exploration and direct file download. + +3. **Record Download Results**: Updates `state.download_results` with the download status and path for each dataset. + +### 2.4 Data Processing & Mapping Phase + +#### Postprocess Node + +- **Responsibility**: Checks whether there are remaining incomplete subtasks (`check_more_tasks`). If so, loops back to the collection phase; otherwise, proceeds to the mapping phase. + +#### Mapping Node + +Mapping Node is responsible for converting collected intermediate-format data into standard Alpaca format, using LLM to generate adaptive Python mapping functions. + +1. **Read Intermediate Data**: Loads raw records from `intermediate.jsonl`. +2. **LLM Generates Mapping Function (Triple Verification)**: + 1. Generates the mapping function 3 times. + 2. Validates consistency on sample data. + 3. Uses the function after passing verification. +3. **Batch Processing**: Executes mapping transformation on all records. +4. **Quality Filtering**: Applies quality filters to remove low-quality data. +5. **Save Results**: Outputs in both `.jsonl` and `.json` formats. + +**Alpaca Format Definition**: + +```json +{ + "instruction": "Task instruction or question", + "input": "Optional input context (e.g., system prompt, SQL Schema)", + "output": "Expected answer or output" +} +``` + +**SFT Data Mapping Rules**: +- `system` role → `input` field +- `user` role → `instruction` field +- `assistant` role → `output` field + +**Mapping Example (Text2SQL)**: + +```json +// Input format +{ + "messages": [ + {"role": "system", "content": "CREATE TABLE farm (Id VARCHAR)"}, + {"role": "user", "content": "How many farms are there?"}, + {"role": "assistant", "content": "SELECT COUNT(*) FROM farm"} + ] +} + +// Output Alpaca format +{ + "instruction": "How many farms are there?", + "input": "CREATE TABLE farm (Id VARCHAR)", + "output": "SELECT COUNT(*) FROM farm" +} +``` + +## 3. 
State Management & Output + +### 3.1 WebCollectionState Core Fields + +```python +@dataclass +class WebCollectionState(MainState): + # Task related + user_query: str # User's original requirement + task_list: List[Dict] # Decomposed task list + current_task_index: int # Current task index + + # Search related + research_summary: str # Research summary + urls_visited: List[str] # Visited URLs + subtasks: List[Dict] # Download subtasks + + # Download related + download_results: Dict # Download result statistics + + # WebCrawler related + webcrawler_crawled_pages: List # Crawled pages + webcrawler_sft_records: List # SFT records + webcrawler_pt_records: List # PT records + + # Mapping related + mapping_results: Dict # Mapping results + intermediate_data_path: str # Intermediate data path +``` + +### 3.2 WebCollectionRequest Configuration + +```python +@dataclass +class WebCollectionRequest(MainRequest): + # Task configuration + category: str = "PT" # PT or SFT + output_format: str = "alpaca" + + # Search configuration + search_engine: str = "tavily" + max_depth: int = 2 + max_urls: int = 10 + concurrent_limit: int = 5 + topk_urls: int = 5 + + # WebCrawler configuration + enable_webcrawler: bool = True + webcrawler_num_queries: int = 5 + webcrawler_crawl_depth: int = 3 + webcrawler_concurrent_pages: int = 3 +``` + +### 3.3 Output File Structure + +``` +web_collection_output/ +├── rag_db/ # RAG vector database +├── hf_datasets/ # HuggingFace downloaded data +│ └── dataset_name/ +├── kaggle_datasets/ # Kaggle downloaded data +├── web_downloads/ # Direct web downloads +├── webcrawler_output/ # WebCrawler crawled results +│ └── webcrawler_crawled.jsonl +├── processed_output/ # Post-processing results +│ └── intermediate.jsonl +└── mapped_output/ # Final mapping results + ├── final_alpaca_sft.jsonl # Alpaca format (JSONL) + └── final_alpaca_sft.json # Alpaca format (JSON) +``` + +## 4. User Guide + +This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. + +### 4.1 Graphical Interface + +The front-end page code is located in `gradio_app/pages/web_collection.py`, providing a visual interactive experience. To launch the web interface: + +```bash +python gradio_app/app.py +``` + +Visit `http://127.0.0.1:7860` to start using + +![web_agent](/web_agent.png) + +1. `step1:` Describe the type of data you want to collect in the "Target Description" field +2. `step2:` Select the data category (PT or SFT) +3. `step3:` Configure dataset quantity and size limits +4. `step4:` Configure LLM API information (URL, Key, Model) +5. `step5:` (Optional) Configure Kaggle, Tavily, and other service keys +6. `step6:` Click the **"Start Web Collection & Conversion"** button +7. `step7:` Monitor the execution logs in real time +8. `step8:` Review the result summary after completion +9. `step9:` Check the collected data in the download directory + +**Advanced Usage**: Expand the "Advanced Configuration" section to adjust search engine selection, parallelism, caching strategy, data conversion parameters, etc. + +### 4.2 Script Invocation + +For automated tasks or batch collection, it is recommended to use the command line script `script/run_web_collection.py` directly. + +#### 1. Environment Variable Configuration + +```bash +export DF_API_URL="https://api.openai.com/v1" +export DF_API_KEY="your_api_key" +export TAVILY_API_KEY="your_tavily_key" +export KAGGLE_USERNAME="" +export KAGGLE_KEY="" +export RAG_API_URL="" +export RAG_API_KEY="" +``` + +#### 2. 
Run the Script + +```bash +# Basic usage +python script/run_web_collection.py --target "Collect machine learning Q&A datasets" + +# Full parameters +python script/run_web_collection.py \ + --target "Collect code generation datasets" \ + --category SFT \ + --max-urls 10 \ + --max-depth 2 \ + --download-dir ./my_output +``` + +**Main Parameter Description**: + +- **`--target`**: Data collection target description (required) +- **`--category`**: Data category, `PT` or `SFT` (default `SFT`) +- **`--max-urls`**: Maximum number of URLs (default 10) +- **`--max-depth`**: Maximum crawl depth (default 2) +- **`--output-format`**: Output format (default `alpaca`) + +#### 3. Python API Call + +```python +from dataflow_agent.workflow.wf_web_collection import run_web_collection + +result = await run_web_collection( + target="Collect machine learning code examples", + category="SFT", + output_format="alpaca", + download_dir="./my_output", + model="gpt-4o" +) +``` + +### 4.3 Practical Case: Collecting a Chinese Q&A Dataset + +Suppose we need to build a Chinese Q&A training dataset for a chatbot. Here is the complete workflow. + +**Scenario Configuration:** + +```bash +export DF_API_URL="https://api.openai.com/v1" +export DF_API_KEY="your_api_key" +export TAVILY_API_KEY="your_tavily_key" + +python script/run_web_collection.py \ + --target "Collect Chinese Q&A datasets for fine-tuning" \ + --category SFT \ + --max-urls 20 +``` + +**Run:** +After running the script, the workflow will execute in the following steps: + +1. **Task Decomposition**: LLM decomposes "Collect Chinese Q&A datasets for fine-tuning" into multiple subtasks (e.g., Chinese common knowledge Q&A, Chinese reading comprehension, etc.). +2. **Category Classification**: Based on the "fine-tuning" keyword, automatically classifies as SFT type. +3. **Parallel Collection**: WebSearch explores Chinese QA datasets on platforms such as HuggingFace and GitHub; WebCrawler simultaneously crawls Q&A content from technical blogs. +4. **Intelligent Download**: LLM decides to prioritize downloading matching datasets from HuggingFace, falling back to Kaggle and direct web download on failure. +5. **Format Mapping**: Converts the downloaded heterogeneous data into unified Alpaca format, outputting to the `mapped_output/` directory. + +Users can find the final `final_alpaca_sft.jsonl` file in the download directory, ready for direct use in model fine-tuning training. + +### 4.4 Notes + +1. **API Keys** + - Ensure that necessary API keys are configured + - Tavily is used for search; Kaggle is used for downloading Kaggle datasets + +2. **Network Environment** + - If located in China, it is recommended to use a HuggingFace mirror (set `HF_ENDPOINT`) + - Adjust the parallelism to match your network bandwidth + +3. **Storage Space** + - Ensure sufficient disk space is available + - Large datasets may require several GB of storage + +4. **Execution Time** + - The collection process may take a considerable amount of time (minutes to hours) + - You can control the duration by limiting the number of download tasks + +5. 
**Data Quality** + - Enabling RAG enhancement can improve data quality + - Adjust sampling parameters to balance quality and speed diff --git a/docs/zh/notes/guide/agent/web_collection.md b/docs/zh/notes/guide/agent/web_collection.md new file mode 100644 index 000000000..995175e8c --- /dev/null +++ b/docs/zh/notes/guide/agent/web_collection.md @@ -0,0 +1,372 @@ +--- +title: Web 数据采集 +createTime: 2026/02/14 00:00:00 +permalink: /zh/guide/agent/web_collection/ +--- + +## 1. 概述 + +**Web Collection Agent** 是 DataFlow-Agent 中的智能数据收集模块,专门用于从互联网自动收集、处理和格式化训练数据集。该系统支持两种数据类型: + +- **PT(Pre-Training,预训练)**:大规模无标注语料,用于模型预训练。 +- **SFT(Supervised Fine-Tuning,监督微调)**:结构化的指令-回答对,用于模型微调。 + +该工作流能够: + +1. **网页搜索与探索**:基于多层 BFS 森林探索策略,由 LLM 驱动 URL 筛选,自动发现和定位目标数据集。 +2. **多平台数据下载**:支持 HuggingFace、Kaggle、Web 直接下载三种方式,LLM 智能决策下载优先顺序。 +3. **双通道并行采集**:WebSearch 和 WebCrawler 两条采集流程并行执行,提供更丰富的数据来源。 +4. **自适应数据映射**:LLM 生成 Python 映射函数,通过三重验证机制,自动将异构数据转换为标准 Alpaca 格式。 + +## 2. 系统架构 + +该功能由 `dataflow_agent/workflow/wf_web_collection.py` 编排,形成一个包含并行分支和条件循环的有向图。整体流程分为四个阶段:任务分析、数据采集(并行)、数据下载、数据处理与映射。 + +### 2.1 任务分析阶段 + +1. **Start Node(初始化节点)** + 1. **职责**: 初始化工作流配置,创建下载目录,准备执行环境。 + 2. **输入**: `state.request.target`(用户原始需求)。 + 3. **输出**: 初始化后的 `user_query` 和下载目录。 + +2. **Task Decomposer(任务分解节点)** + 1. **职责**: 使用 LLM 将复杂的用户需求分解为可执行的子任务,限制最大任务数量(默认 5 个)。 + 2. **输入**: 用户原始查询。 + 3. **LLM 思考**: 分析需求语义,拆分为独立的数据收集子任务。 + 4. **输出**: `state.task_list`,例如: + - 子任务 1:收集 NLP 问答数据集 + - 子任务 2:收集文本分类数据集 + - 子任务 3:收集图像分类数据集 + +3. **Category Classifier(分类节点)** + 1. **职责**: 判断当前任务属于 PT 还是 SFT 类型。 + 2. **输入**: 当前子任务名称。 + 3. **LLM 思考**: 结合任务描述判断数据类别,生成数据集背景描述。 + 4. **输出**: `state.category`(`"PT"` 或 `"SFT"`)以及 `dataset_background`。 + 5. **后备机制**: 当 LLM 无法判断时,使用关键词匹配。SFT 关键词包括:`["sft", "微调", "问答", "qa", "instruction", "fine-tuning"]`。 + +### 2.2 数据采集阶段(并行执行) + +任务分析完成后,系统进入 `parallel_collection` 并行分支,同时启动 WebSearch 和 WebCrawler 两条采集流程。 + +#### 2.2.1 WebSearch Node(网页搜索节点) + +WebSearch Node 是系统的核心数据收集节点,实现了完整的网页探索和信息提取流程,包含以下核心组件: + +1. **QueryGenerator(查询生成器)** + - **职责**: 基于用户原始需求,生成 3-5 个多样化的搜索查询。 + - **示例**: 输入 `"收集 Python 代码生成数据集"`,输出: + - `"Python code generation dataset download"` + - `"Python programming instruction dataset HuggingFace"` + - `"code completion training data GitHub"` + +2. **WebTools(网页工具集)** + - **search_web()**: 调用搜索引擎(Tavily / DuckDuckGo / Jina)获取初始 URL 列表。 + - **read_with_jina_reader()**: 使用 Jina Reader/MinerU-HTML 爬取网页内容,返回结构化的 Markdown 格式文本。 + +3. **多层 BFS 森林探索** + - **算法**: 采用广度优先搜索(BFS)策略,逐层探索网页链接。每一层中,使用 Jina Reader/MinerU-HTML 爬取页面内容,提取候选 URL,再由 URLSelector 筛选最相关的链接进入下一层。 + - **关键参数**: + - `max_depth`: 最大探索深度(默认 2) + - `concurrent_limit`: 并发请求数(默认 10) + - `topk_urls`: 每层筛选的 URL 数量(默认 5) + - `url_timeout`: 请求超时时间(默认 60 秒) + +4. **URLSelector(URL 筛选器)** + - **职责**: 使用 LLM 从候选 URL 列表中选择与研究目标最相关的 URL。 + - **筛选策略**: 分析 URL 与研究目标的相关性、域名可信度,避免重复内容,过滤被阻止的域名。 + +5. **RAGManager(RAG 管理器)** + - **职责**: 将爬取的网页内容存储到向量数据库中,支持后续的语义检索,为 SummaryAgent 提供上下文。 + +6. **SummaryAgent(摘要代理)** + - **职责**: 基于 RAG 检索的内容,生成具体的下载子任务。 + - **输出**: 结构化的子任务列表,例如: + ```json + { + "type": "download", + "objective": "下载 Spider Text2SQL 数据集", + "search_keywords": ["spider dataset", "text2sql"], + "platform_hint": "huggingface", + "priority": 1 + } + ``` + +#### 2.2.2 WebCrawler Node(网页爬虫节点) + +WebCrawler Node 专门用于从网页中提取代码块和技术内容,与 WebSearch Node 并行执行,提供更丰富的数据来源。 + +1. **生成搜索查询**: 针对代码/技术内容生成专用搜索查询。 +2. **搜索与爬取**: 搜索网页获取 URL 列表,使用 Jina Reader 并发爬取页面内容。 +3. **代码块提取**: 调用 `extract_code_blocks_from_markdown` 从 Markdown 内容中提取代码块。 +4. 
**结果保存**: 将爬取结果保存为 `webcrawler_crawled.jsonl`。 + +### 2.3 数据下载阶段 + +**Download Node(下载节点)** 执行实际的数据集下载任务,支持三种下载方式,并使用 LLM 智能决策下载优先顺序。 + +1. **DownloadMethodDecisionAgent(LLM 决策)** + - **职责**: 根据任务目标分析最佳下载方式,输出优先顺序列表,例如 `["huggingface", "kaggle", "web"]`。 + +2. **依次尝试每种下载方式**: + - **HuggingFace**: 搜索 HuggingFace Hub,LLM 选择最佳匹配数据集,调用 API 下载。 + - **Kaggle**: 搜索 Kaggle 数据集,LLM 选择最佳匹配,通过 Kaggle API 下载。 + - **Web**: 使用 WebAgent 智能探索网页,直接下载文件。 + +3. **记录下载结果**: 更新 `state.download_results`,包含每个数据集的下载状态和路径。 + +### 2.4 数据处理与映射阶段 + +#### Postprocess Node(后处理节点) + +- **职责**: 检查是否还有未完成的子任务(`check_more_tasks`),如果有则循环回到采集阶段;否则进入映射阶段。 + +#### Mapping Node(数据映射节点) + +Mapping Node 负责将收集到的中间格式数据转换为标准的 Alpaca 格式,使用 LLM 生成自适应的 Python 映射函数。 + +1. **读取中间数据**: 加载 `intermediate.jsonl` 中的原始记录。 +2. **LLM 生成映射函数(三重验证)**: + 1. 生成映射函数 3 次。 + 2. 在样本数据上验证一致性。 + 3. 通过验证后使用。 +3. **批量处理**: 对所有记录执行映射转换。 +4. **质量过滤**: 应用质量过滤器剔除低质量数据。 +5. **保存结果**: 输出为 `.jsonl` 和 `.json` 两种格式。 + +**Alpaca 格式定义**: + +```json +{ + "instruction": "任务指令或问题", + "input": "可选的输入上下文(如系统提示、SQL Schema)", + "output": "期望的回答或输出" +} +``` + +**SFT 数据映射规则**: +- `system` 角色 → `input` 字段 +- `user` 角色 → `instruction` 字段 +- `assistant` 角色 → `output` 字段 + +**映射示例(Text2SQL)**: + +```json +// 输入格式 +{ + "messages": [ + {"role": "system", "content": "CREATE TABLE farm (Id VARCHAR)"}, + {"role": "user", "content": "How many farms are there?"}, + {"role": "assistant", "content": "SELECT COUNT(*) FROM farm"} + ] +} + +// 输出 Alpaca 格式 +{ + "instruction": "How many farms are there?", + "input": "CREATE TABLE farm (Id VARCHAR)", + "output": "SELECT COUNT(*) FROM farm" +} +``` + +## 3. 状态管理与输出 + +### 3.1 WebCollectionState 核心字段 + +```python +@dataclass +class WebCollectionState(MainState): + # 任务相关 + user_query: str # 用户原始需求 + task_list: List[Dict] # 分解后的任务列表 + current_task_index: int # 当前任务索引 + + # 搜索相关 + research_summary: str # 调研总结 + urls_visited: List[str] # 已访问 URL + subtasks: List[Dict] # 下载子任务 + + # 下载相关 + download_results: Dict # 下载结果统计 + + # WebCrawler 相关 + webcrawler_crawled_pages: List # 爬取的页面 + webcrawler_sft_records: List # SFT 记录 + webcrawler_pt_records: List # PT 记录 + + # 映射相关 + mapping_results: Dict # 映射结果 + intermediate_data_path: str # 中间数据路径 +``` + +### 3.2 WebCollectionRequest 配置 + +```python +@dataclass +class WebCollectionRequest(MainRequest): + # 任务配置 + category: str = "PT" # PT 或 SFT + output_format: str = "alpaca" + + # 搜索配置 + search_engine: str = "tavily" + max_depth: int = 2 + max_urls: int = 10 + concurrent_limit: int = 5 + topk_urls: int = 5 + + # WebCrawler 配置 + enable_webcrawler: bool = True + webcrawler_num_queries: int = 5 + webcrawler_crawl_depth: int = 3 + webcrawler_concurrent_pages: int = 3 +``` + +### 3.3 输出文件结构 + +``` +web_collection_output/ +├── rag_db/ # RAG 向量数据库 +├── hf_datasets/ # HuggingFace 下载数据 +│ └── dataset_name/ +├── kaggle_datasets/ # Kaggle 下载数据 +├── web_downloads/ # Web 直接下载 +├── webcrawler_output/ # WebCrawler 爬取结果 +│ └── webcrawler_crawled.jsonl +├── processed_output/ # 后处理结果 +│ └── intermediate.jsonl +└── mapped_output/ # 最终映射结果 + ├── final_alpaca_sft.jsonl # Alpaca 格式(JSONL) + └── final_alpaca_sft.json # Alpaca 格式(JSON) +``` + +## 4. 使用指南 + +本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 + +### 4.1 图形界面 + +前端页面代码位于 `gradio_app/pages/web_collection.py`,提供了可视化的交互体验。启动 Web 界面: + +```bash +python gradio_app/app.py +``` + +访问 `http://127.0.0.1:7860` 开始使用 + +![web_agent](/web_agent.png) + +1. `step1:` 在"目标描述"中详细说明要收集的数据类型 +2. `step2:` 选择数据类别(PT 或 SFT) +3. `step3:` 配置数据集数量和大小限制 +4. `step4:` 配置 LLM API 信息(URL、Key、模型) +5. 
`step5:`(可选)配置 Kaggle、Tavily 等服务的密钥 +6. `step6:` 点击 **"开始网页采集与转换"** 按钮 +7. `step7:` 实时查看执行日志 +8. `step8:` 等待完成后查看结果摘要 +9. `step9:` 在下载目录中查看采集的数据 + +**高级使用**:展开"高级配置"区域,可调整搜索引擎选择、并行处理数量、缓存策略、数据转换参数等。 + +### 4.2 脚本调用 + +对于自动化任务或批量采集,推荐直接使用命令行脚本 `script/run_web_collection.py`。 + +#### 1. 环境变量配置 + +```bash +export DF_API_URL="https://api.openai.com/v1" +export DF_API_KEY="your_api_key" +export TAVILY_API_KEY="your_tavily_key" +export KAGGLE_USERNAME="" +export KAGGLE_KEY="" +export RAG_API_URL="" +export RAG_API_KEY="" +``` + +#### 2. 运行脚本 + +```bash +# 基本用法 +python script/run_web_collection.py --target "收集机器学习问答数据集" + +# 完整参数 +python script/run_web_collection.py \ + --target "收集代码生成数据集" \ + --category SFT \ + --max-urls 10 \ + --max-depth 2 \ + --download-dir ./my_output +``` + +**主要参数说明**: + +- **`--target`**: 数据收集目标描述(必填) +- **`--category`**: 数据类别,`PT` 或 `SFT`(默认 `SFT`) +- **`--max-urls`**: 最大 URL 数量(默认 10) +- **`--max-depth`**: 最大爬取深度(默认 2) +- **`--output-format`**: 输出格式(默认 `alpaca`) + +#### 3. Python API 调用 + +```python +from dataflow_agent.workflow.wf_web_collection import run_web_collection + +result = await run_web_collection( + target="收集机器学习代码示例", + category="SFT", + output_format="alpaca", + download_dir="./my_output", + model="gpt-4o" +) +``` + +### 4.3 实战 Case:收集中文问答数据集 + +假设我们需要为聊天机器人构建一份中文问答训练数据集,以下是完整的操作流程。 + +**场景配置:** + +```bash +export DF_API_URL="https://api.openai.com/v1" +export DF_API_KEY="your_api_key" +export TAVILY_API_KEY="your_tavily_key" + +python script/run_web_collection.py \ + --target "收集中文问答数据集用于微调" \ + --category SFT \ + --max-urls 20 +``` + +**运行:** +运行脚本后,工作流会按以下步骤执行: + +1. **任务分解**: LLM 将"收集中文问答数据集用于微调"拆解为多个子任务(如中文常识问答、中文阅读理解等)。 +2. **分类判定**: 根据"微调"关键词,自动判定为 SFT 类型。 +3. **并行采集**: WebSearch 探索 HuggingFace、GitHub 等平台上的中文 QA 数据集;WebCrawler 同步抓取技术博客中的问答内容。 +4. **智能下载**: LLM 决策优先从 HuggingFace 下载匹配数据集,失败后回退到 Kaggle 和 Web 直接下载。 +5. **格式映射**: 将下载的异构数据统一转换为 Alpaca 格式,输出到 `mapped_output/` 目录。 + +用户可以在下载目录下找到最终的 `final_alpaca_sft.jsonl` 文件,直接用于模型微调训练。 + +### 4.4 注意事项 + +1. **API 密钥** + - 确保配置了必要的 API 密钥 + - Tavily 用于搜索,Kaggle 用于下载 Kaggle 数据集 + +2. **网络环境** + - 如果在国内,建议使用 HuggingFace 镜像(设置 `HF_ENDPOINT`) + - 调整并行数量以适应网络带宽 + +3. **存储空间** + - 确保有足够的磁盘空间 + - 大型数据集可能需要数 GB 空间 + +4. **执行时间** + - 采集过程可能需要较长时间(几分钟到几小时) + - 可以通过限制下载任务数量来控制时间 + +5. **数据质量** + - 启用 RAG 增强可以提高数据质量 + - 调整采样参数以平衡质量和速度
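+
+作为第 5 条"数据质量"的补充,下面给出一个简单的抽查示意(仅供参考:路径假设为默认输出目录下的 `mapped_output/final_alpaca_sft.jsonl`,实际以 `--download-dir` 参数为准),用于快速检查映射结果是否符合 Alpaca 格式,并随机抽样人工查看质量:
+
+```python
+# 简单抽查示意(仅供参考),路径请按实际下载目录调整
+import json
+import random
+
+path = "web_collection_output/mapped_output/final_alpaca_sft.jsonl"  # 假设的默认输出路径
+
+records = []
+with open(path, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if line:
+            records.append(json.loads(line))
+
+# 检查 Alpaca 必需字段是否齐全
+required = {"instruction", "input", "output"}
+missing = [r for r in records if not required.issubset(r)]
+print(f"总记录数: {len(records)},字段缺失的记录数: {len(missing)}")
+
+# 随机抽取几条,人工检查指令与回答质量
+for sample in random.sample(records, k=min(3, len(records))):
+    print(json.dumps(sample, ensure_ascii=False, indent=2))
+```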