From d8a38b6cf065dcabe36353b6a54a400e5c03965a Mon Sep 17 00:00:00 2001 From: wiILIL <975202246@qq.com> Date: Thu, 27 Nov 2025 16:19:29 +0800 Subject: [PATCH] revise the offline_tsds and add offline_near --- docs/.vuepress/notes/en/guide.ts | 3 +- docs/.vuepress/notes/zh/guide.ts | 3 +- .../guide/selector/selector_offline_near.md | 201 ++++++++++++++ .../guide/selector/selector_offline_tsds.md | 236 ++++++++++++++++ .../guide/selector/selector_offline_near.md | 218 +++++++++++++++ .../guide/selector/selector_offline_tsds.md | 254 ++++++++++++++++++ 6 files changed, 913 insertions(+), 2 deletions(-) create mode 100644 docs/en/notes/guide/selector/selector_offline_near.md create mode 100644 docs/en/notes/guide/selector/selector_offline_tsds.md create mode 100644 docs/zh/notes/guide/selector/selector_offline_near.md create mode 100644 docs/zh/notes/guide/selector/selector_offline_tsds.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index c4cff1b..ce20914 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -25,7 +25,8 @@ export const Guide: ThemeNote = defineNoteConfig({ 'quickstart', 'tutorial', 'selector_less', - 'selector_tsds', + 'selector_offline_tsds', + 'selector_offline_near', 'selector_zeroth' ], }, diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index 0b86798..5f56c23 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -25,7 +25,8 @@ export const Guide: ThemeNote = defineNoteConfig({ 'quickstart', 'tutorial', 'selector_less', - 'selector_tsds', + 'selector_offline_tsds', + 'selector_offline_near', 'selector_zeroth', ], }, diff --git a/docs/en/notes/guide/selector/selector_offline_near.md b/docs/en/notes/guide/selector/selector_offline_near.md new file mode 100644 index 0000000..9180035 --- /dev/null +++ b/docs/en/notes/guide/selector/selector_offline_near.md @@ -0,0 +1,201 @@ +--- +title: Offline-Near-Selector 
+createTime: 2025/11/27 16:02:41 +permalink: /en/guide/7k0w3d92/ +icon: flowbite:fish-alt-outline +--- +# Offline NEAR Selector + +This document introduces how to use the **Offline NEAR Selector** for **dynamic data selection** during supervised fine-tuning (SFT) within the **DataFlex** framework, selecting the training data closest to the target dataset to improve generalization performance. + +--- + +## 1. Method Overview + +The core idea of **NEAR** is: + +* Further encode **already tokenized** samples into **sentence embeddings** (e.g., 512‑dim). +* Perform **nearest‑neighbor search** in the embedding space to obtain each sample’s similarity to the target dataset. + +> Intuition: **select the training data closest to the target dataset** + +### Scoring Formulation + +Let the sentence embedding of a sample be $e_i$, and let its `max_K` nearest neighbors be $\mathcal{N}_K(i)$. + + + +--- + +## 2. Environment & Dependencies + +```bash +# DataFlex (recommended: editable install) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# Common training/inference dependencies (as needed) +pip install llamafactory + +# NEAR extras (vector search & progress bars) +pip install faiss-cpu vllm sentence-transformers +``` + +--- + +## 3. 
Offline Selection + +Modify training set, embedding model, and parameters inside +**DataFlex/src/dataflex/offline_selector/offline_near_selector.py**: +```python +if __name__ == "__main__": + near = offline_near_Selector( + candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = train + query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = validation + + # If you want to use vllm, please add "vllm:" before the model name + # Otherwise it automatically uses sentence-transformers + embed_model="vllm:Qwen/Qwen3-Embedding-0.6B", + batch_size=32, + save_indices_path="top_indices.npy", + max_K=1000, + + ) + near.selector() +``` + +Note: `embed_model` is used to encode the already-tokenized text into sentence embeddings (e.g., 512-dim), supporting both vLLM and sentence-transformers inference. + +Output: an index matrix (shape `(N, max_K)`, one row per query) holding the indices of the `max_K` nearest training samples for each query, saved to `save_indices_path`. + +--- + +## 4. Key Hyperparameters & Tips + +| Parameter | Typical Range | Meaning & Tips | +| ------------- | ------------- | --------------------------------------------------------------------------------------------- | +| `max_K` | 64–10000 | Upper bound of NN retrieval. Larger = stabler but more costly; balance with data size & VRAM. | +| `model_name` | — | Path/name of the sentence encoder (local BERT/USE/SimCSE, etc.). | +| `cache_dir` | — | Cache directory for intermediate artifacts and resume‑from‑cache. | + +--- + +## 5. Component Config (`components.yaml`) + +**Path:** `DataFlex/src/dataflex/configs/components.yaml` + +**Preset example** + +```yaml +near: + name: near + params: + indices_path: ./src/dataflex/offline_selector/top_indices.npy + cache_dir: ../dataflex_saves/near_output + +``` + +--- + +## 6. 
Dynamic Training Config (LoRA + NEAR) + +**Example file:** `DataFlex/examples/train_lora/selectors/near.yaml` + +```yaml +### model +model_name_or_path: +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 + +### dataset +dataset: # training dataset +template: qwen +cutoff_len: 4096 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: ../dataflex_saves +logging_steps: 10 +save_steps: 100 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true + +### Dataflex args +train_type: dynamic_select +components_cfg_file: src/dataflex/configs/components.yaml +component_name: near +warmup_step: 400 +update_step: 500 +update_times: 2 + +``` + +**Notes:** + +* `component_name: near` enables the NEAR component. +* `warmup_step / update_step / update_times` decide **when** and **how often** to re‑select the training subset; total steps ≈ `warmup_step + update_step × update_times`. +* Total batch size = `device_number × per_device_train_batch_size × gradient_accumulation_steps`. + +--- + +## 7. Run Training + +```bash +FORCE_TORCHRUN=1 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/near.yaml +``` + +**Note:** the above example runs with distributed launch. + +During training, NEAR is triggered at the scheduled steps: based on the offline-selected sample indices, it selects the next training subset. + +--- + +## 8. Merge & Export the Model + +Same as the Less Selector pipeline. 
+ +**Config file:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: base model path +adapter_name_or_path: finetuned adapter path +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves +export_size: 5 +export_device: cpu +export_legacy_format: false + +``` + +Run the export command (inside the LLaMA‑Factory directory): + +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. Evaluation & Comparison + +We recommend using the [DataFlow](https://github.com/OpenDCAI/DataFlow) QA evaluation pipeline to compare **NEAR** against **Less** and **random sampling**. + + diff --git a/docs/en/notes/guide/selector/selector_offline_tsds.md b/docs/en/notes/guide/selector/selector_offline_tsds.md new file mode 100644 index 0000000..bee1be4 --- /dev/null +++ b/docs/en/notes/guide/selector/selector_offline_tsds.md @@ -0,0 +1,236 @@ +--- +title: Offline-Tsds-Selector +createTime: 2025/11/01 21:36:21 +permalink: /en/guide/im5q9cd2/ +icon: tdesign:cat +--- + + +# Offline TSDS Selector + +This document introduces how to use the **Offline TSDS Selector** for **dynamic data selection** during supervised fine-tuning (SFT) within the **DataFlex** framework, achieving a balance between **density representativeness** and **diversity** to improve generalization performance. + +--- + +## 1. Method Overview + +The core idea of **TSDS** is: + +* Further encode **already tokenized** samples into **sentence embeddings** (e.g., 512‑dim). +* Perform **nearest‑neighbor search & kernel density estimation (KDE)** in the embedding space to obtain each sample’s representativeness score. +* Incorporate **topological diversity** (avoid only picking clusters), and trade off density vs. diversity via the coefficient `alpha`. + +> Intuition: **Higher density** ⇒ more “typical/representative” samples; **higher diversity** ⇒ broader coverage and less redundancy. 
+ +### Scoring Formulation + +Let the sentence embedding of a sample be $e_i$, and let its $K$ nearest neighbors be $\mathcal{N}_K(i)$. + + +1. **Kernel Density Estimation (KDE):** + $$ + \text{density}(i) + = \frac{1}{K} \sum_{j\in \mathcal{N}_K(i)} + \exp\!\left(-\frac{\lVert e_i - e_j \rVert^2}{2\sigma^2}\right) + $$ + +2. **Diversity (simple implementation via de‑dup penalty / marginal gain):** + $$ + \text{diversity}(i)\ \propto\ + \min_{j\in S} \lVert e_i - e_j \rVert,\quad + S=\text{selected set} + $$ + +3. **Combined Score:** + $$ + \text{score}(i) + = \alpha\, \text{density}(i) + + (1-\alpha)\, \text{diversity}(i) + $$ + +> In practice, `kde_K` (neighbors used by KDE) and `max_K` (overall NN search limit) can differ. `C` can be used as a selection ratio/threshold or other control term depending on the implementation. + +--- + +## 2. Environment & Dependencies + +```bash +# DataFlex (recommended: editable install) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# Common training/inference dependencies (as needed) +pip install llamafactory + +# TSDS extras (vector search & progress bars) +pip install faiss-cpu vllm sentence-transformers +``` + +--- + +## 3. 
Offline Selection + +Modify training set, embedding model, and parameters inside +**DataFlex/src/dataflex/offline_selector/offline_tsds_selector.py**: +```python +if __name__ == "__main__": + tsds = offline_tsds_Selector( + candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # training set + query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # validation set + + # If you want to use vllm, please add "vllm:" before the model name + # Otherwise it automatically uses sentence-transformers + embed_model="vllm:Qwen/Qwen3-Embedding-0.6B", # embedding model + batch_size=32, + save_probs_path="tsds_probs.npy", + max_K=5000, + kde_K=1000, + sigma=0.75, + alpha=0.6, + C=5.0 + ) + tsds.selector() +``` + +Note: `embed_model` is used to encode the already-tokenized text into sentence embeddings (e.g., 512-dim), supporting both vLLM and sentence-transformers inference. + +Output: a sampling probability for each training sample, saved to `save_probs_path`. + +--- + +## 4. Key Hyperparameters & Tips + +| Parameter | Typical Range | Meaning & Tips | +| ------------- | ------------- | --------------------------------------------------------------------------------------------- | +| `max_K` | 64–10000 | Upper bound of NN retrieval. Larger = stabler but more costly; balance with data size & VRAM. | +| `kde_K` | 16–2000 | #neighbors in KDE. Smaller = more sensitive; larger = smoother. Usually `kde_K ≤ max_K`. | +| `sigma` | 0.5–2.0 | KDE bandwidth. Too small ⇒ noisy; too large ⇒ oversmoothing. | +| `alpha` | 0.3–0.7 | Trade‑off between representativeness (density) and coverage (diversity). | +| `C` | 0.01–1.0 | Selection ratio/threshold or regularization strength depending on implementation. | +| `sample_size` | 500–5000 | Candidate pool size per selection step; heavily impacts speed & quality. | +| `model_name` | — | Path/name of the sentence encoder (local BERT/USE/SimCSE, etc.). | +| `cache_dir` | — | Cache directory for intermediate artifacts and resume‑from‑cache. | + +--- + +## 5. 
Component Config (`components.yaml`) + +**Path:** `DataFlex/src/dataflex/configs/components.yaml` + +**Preset example** + +```yaml +tsds: + name: tsds + params: + probs_path: ./src/dataflex/offline_selector/tsds_probs.npy + # default location of tsds_probs.npy + cache_dir: ../dataflex_saves/tsds_output + +``` + +--- + +## 6. Dynamic Training Config (LoRA + TSDS) + +**Example file:** `DataFlex/examples/train_lora/selectors/tsds.yaml` + +```yaml +### model +model_name_or_path: +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 + +### dataset +dataset: # training dataset +template: qwen +cutoff_len: 4096 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: ../dataflex_saves +logging_steps: 10 +save_steps: 100 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true + +### Dataflex args +train_type: dynamic_select +components_cfg_file: src/dataflex/configs/components.yaml +component_name: tsds +warmup_step: 400 +update_step: 500 +update_times: 2 + +``` + +**Notes:** + +* `component_name: tsds` enables the TSDS component. +* `warmup_step / update_step / update_times` decide **when** and **how often** to re‑select the training subset; total steps ≈ `warmup_step + update_step × update_times`. +* Total batch size = `device_number × per_device_train_batch_size × gradient_accumulation_steps`. + +--- + +## 7. Run Training + +```bash +FORCE_TORCHRUN=1 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/tsds.yaml +``` + +**Note:** the above example runs with distributed launch. + +During training, TSDS is triggered at the scheduled steps: based on the offline-computed sampling probabilities, it selects the next training subset. + +--- + +## 8. Merge & Export the Model + +Same as the Less Selector pipeline. 
+ +**Config file:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: base model path +adapter_name_or_path: finetuned adapter path +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves +export_size: 5 +export_device: cpu +export_legacy_format: false + +``` + +Run the export command (inside the LLaMA‑Factory directory): + +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. Evaluation & Comparison + +We recommend using the [DataFlow](https://github.com/OpenDCAI/DataFlow) QA evaluation pipeline to compare **TSDS** against **Less** and **random sampling**. + + diff --git a/docs/zh/notes/guide/selector/selector_offline_near.md b/docs/zh/notes/guide/selector/selector_offline_near.md new file mode 100644 index 0000000..8172975 --- /dev/null +++ b/docs/zh/notes/guide/selector/selector_offline_near.md @@ -0,0 +1,218 @@ +--- +title: Offline-Near数据选择器 +createTime: 2025/11/26 23:42:41 +permalink: /zh/guide/acgesu99/ +icon: flowbite:fish-alt-outline +--- +# Offline NEAR Selector 使用介绍 + +本文档介绍如何在 **DataFlex** 框架中使用 **Offline NEAR Selector** 实现训练数据的**动态选择**,以在监督微调(SFT)中聚焦于与目标集的相似度,进行邻近选择。 + +--- + +## 1. 方法概述 + +**NEAR** 的核心思想是: + +* 先将**已分词(tokenized)**的样本进一步编码为**句向量**(例如 512 维)。 +* 在嵌入空间中进行**近邻搜索**,得到每个样本与目标集的“样本相似度”。 + + +> 直观理解:选择与目标集最接近的训练数据,以最优化训练目标。 + + +--- + +## 2. 环境与依赖 + +```bash +# DataFlex(建议源码安装) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# 训练与推理的常用依赖 +pip install llamafactory==0.9.3 + +# NEAR 额外依赖(向量检索与进度条等) +pip install faiss-cpu vllm sentence-transformers +``` + +--- + +## 3. 
offline 数据选择 + +在 `DataFlex/src/dataflex/offline_selector/offline_near_selector.py` 文件中修改训练集、编码模型和参数: +```python +if __name__ == "__main__": + near = offline_near_Selector( + candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = train + query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = validation + + # If you want to use vllm, please add "vllm:" before the model name + # Otherwise it automatically uses sentence-transformers + embed_model="vllm:Qwen/Qwen3-Embedding-0.6B", + batch_size=32, + save_indices_path="top_indices.npy", + max_K=1000, + + ) + near.selector() + +``` + +> **注意**:此处的 `embed_model` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 512 维),支持 vLLM 和 sentence-transformers 推理。 + +**最终保存为每个query的max_K个最邻近训练数据的索引矩阵 ( N ,max_K )** + +--- + +## 4. 关键超参数与建议 + +| 参数 | 典型范围 | 含义与建议 | +| ------------- | -------- | ----------------------------------------- | +| `max_K` | 64–2000 | 近邻检索数量上限,越大越稳但开销更高;建议与数据规模/显存权衡 | +| `model_name` | — | 句向量编码模型路径或名称(如本地 embedding 模型) | +| `cache_dir` | — | 中间结果缓存路径,便于断点续跑 | + +--- + +## 5. 组件配置(components.yaml) + +**路径:** `DataFlex/src/dataflex/configs/components.yaml` + +**预设参数** + +```yaml +near: + name: near + params: + indices_path: ./src/dataflex/offline_selector/top_indices.npy + cache_dir: ../dataflex_saves/near_output + +``` + +--- + +## 6. 动态训练配置(LoRA + NEAR) + +**示例文件:** `DataFlex/examples/train_lora/selectors/near.yaml` + +```yaml +### model +model_name_or_path: #模型地址 +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: #训练集 +template: qwen # 训练模型类型:qwen、llama... 
+cutoff_len: 4096 +# max_samples: 100000000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 0 +# disable_shuffling: true +seed: 42 + +### output +output_dir: ../dataflex_saves +logging_steps: 10 +save_steps: 100 +plot_loss: true +save_only_model: false +overwrite_output_dir: true + +### swanlab +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] +# use_swanlab: true +# swanlab_project: medical_dynamic_sft +# swanlab_run_name: qwen2_5_3b_lora_medical_50k_baseline +# swanlab_workspace: word2li +# swanlab_api_key: <your_swanlab_api_key> +# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/<your_webhook_id> +# swanlab_lark_secret: <your_lark_secret> + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: false + +### Dataflex args +train_type: dynamic_select # 选择训练器类型。可选值包括: + # "dynamic_select" - 动态选择训练器 + # "dynamic_mix" - 动态混合训练器 + # "dynamic_weight" - 动态加权训练器 + # "static" - 默认静态训练器 +components_cfg_file: src/dataflex/configs/components.yaml +component_name: near # 选择组件名称,对应 components_cfg_file 中定义的组件 +warmup_step: 400 +update_step: 500 +update_times: 2 +# eval_dataset: alpaca_zh_demo + + +``` + +**参数说明:** + +* `component_name: near`:启用 NEAR 组件。 +* `warmup_step / update_step / update_times`:决定**何时**与**多久**进行一次动态选择;总步数 ≈ `warmup_step + update_step × update_times`。 +* 总batch_size=device_number x per_device_train_batch_size x gradient_accumulation_steps + + +--- + +## 7. 运行训练 + +```bash +FORCE_TORCHRUN=1 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/near.yaml +``` +**注意:** 以上示例采用分布式方式启动。 + +训练过程中会在设定的步数触发 NEAR 动态选择:根据离线选择的样本索引,选出下一阶段训练子集。 + +--- + +## 8. 
模型合并与导出 + +与 Less Selector 流程一致: + +**配置文件:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: 原模型地址 +adapter_name_or_path: 微调后adapter地址 +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves +export_size: 5 +export_device: cpu +export_legacy_format: false +``` + +导出命令: +在llamafactory文件夹中运行 +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. 评估与对比 + +建议使用 [DataFlow](https://github.com/OpenDCAI/DataFlow) 的模型 QA 评估流水线,对 **NEAR** 与 **Less**、**随机采样** 等策略进行并列评测。 \ No newline at end of file diff --git a/docs/zh/notes/guide/selector/selector_offline_tsds.md b/docs/zh/notes/guide/selector/selector_offline_tsds.md new file mode 100644 index 0000000..5f3ffdc --- /dev/null +++ b/docs/zh/notes/guide/selector/selector_offline_tsds.md @@ -0,0 +1,254 @@ +--- +title: Offline-Tsds 数据选择器 +createTime: 2025/11/01 21:35:45 +permalink: /zh/guide/vkqfowej/ +icon: tdesign:cat + +--- + + +# Offline TSDS Selector 使用介绍 + +本文档介绍如何在 **DataFlex** 框架中使用 **Offline TSDS Selector**(Data Selection for Task-Specific Model Finetuning)实现训练数据的**动态选择**,以在监督微调(SFT)中兼顾**密度代表性**与**多样性**,提升泛化效果。 + +--- + +## 1. 方法概述 + +**TSDS** 的核心思想是: + +* 先将**已分词(tokenized)**的样本进一步编码为**句向量**(例如 512 维)。 +* 在嵌入空间中进行**近邻搜索 & 密度估计(KDE)**,得到每个样本的“代表性分数”。 +* 同时考虑**拓扑多样性**(避免只挑“挤在一起”的样本),在密度与多样性之间用系数 `alpha` 做权衡。 + +> 直观理解:密度高 = 更“典型/代表”的数据, +> 多样性高 = 覆盖面更广、减少信息冗余。 + +### 评分构成 + +设样本的句向量为 $e_i$,其 $K$ 个近邻集合为 $\mathcal{N}_K(i)$。 + +1. **核密度估计(KDE)**: +$$ +\text{density}(i) += \frac{1}{K} \sum_{j\in \mathcal{N}_K(i)} +\exp\!\left(-\frac{\lVert e_i - e_j \rVert^2}{2\sigma^2}\right) +$$ + +2. **多样性(简单实现可用去冗余惩罚/边际增益)**: +$$ +\text{diversity}(i)\ \propto\ +\min_{j\in S} \lVert e_i - e_j \rVert,\quad +S=\text{已选集合} +$$ + +3. **综合评分**: +$$ +\text{score}(i) += \alpha\, \text{density}(i) ++ (1-\alpha)\, \text{diversity}(i) +$$ + +> 实际实现中,`kde_K`(用于密度估计的近邻数)与 `max_K`(总检索近邻上限)可不同;`C` 可作为筛选比例/阈值等控制量。 + +--- + +## 2. 
环境与依赖 + +```bash +# DataFlex(建议源码安装) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# 训练与推理的常用依赖 +pip install llamafactory==0.9.3 + +# TSDS 额外依赖(向量检索与进度条等) +pip install faiss-cpu vllm sentence-transformers +``` + +--- + +## 3. offline 数据选择 + +在 `DataFlex/src/dataflex/offline_selector/offline_tsds_selector.py` 文件中修改训练集、编码模型和参数: +```python +if __name__ == "__main__": + tsds = offline_tsds_Selector( + candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w",#训练集 + query_path="OpenDCAI/DataFlex-selector-openhermes-10w",#验证集 + + # If you want to use vllm, please add "vllm:" before the model name + # Otherwise it automatically uses sentence-transformers + embed_model="vllm:Qwen/Qwen3-Embedding-0.6B",#编码模型 + batch_size=32, + save_probs_path="tsds_probs.npy", + max_K=5000, + kde_K=1000, + sigma=0.75, + alpha=0.6, + C=5.0 + ) + tsds.selector() + +``` + +> **注意**:此处的 `embed_model` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 512 维),支持 vLLM 和 sentence-transformers 推理。 + +**最终保存为每个训练样本的采样概率** + +--- + +## 4. 关键超参数与建议 + +| 参数 | 典型范围 | 含义与建议 | +| ------------- | -------- | ----------------------------------------- | +| `max_K` | 64–10000 | 近邻检索数量上限,越大越稳但开销更高;建议与数据规模/显存权衡 | +| `kde_K` | 16–2000 | 用于密度估计的邻居数,越小更敏感、越大更平滑;通常 `kde_K ≤ max_K` | +| `sigma` | 0.5–2.0 | KDE 的核宽度,过小噪声大,过大易过平滑 | +| `alpha` | 0.3–0.7 | 密度 vs 多样性的权衡系数,靠 1 偏重代表性,靠 0 偏重覆盖度 | +| `C` | 0.01–1.0 | 用作筛选比例/阈值/正则系数等控制量;与实现细节相关 | +| `sample_size` | 500–5000 | 每次候选评估的样本数上限;大幅影响速度与效果 | +| `model_name` | — | 句向量编码模型路径或名称(如本地embedding模型) | +| `cache_dir` | — | 中间结果缓存路径,便于断点续跑 | + +--- + +## 5. 组件配置(components.yaml) + +**路径:** `DataFlex/src/dataflex/configs/components.yaml` + +**预设参数** + +```yaml +tsds: + name: tsds + params: + probs_path: ./src/dataflex/offline_selector/tsds_probs.npy + #默认离线数据选择所在位置的tsds_probs.npy文件 + cache_dir: ../dataflex_saves/tsds_output +``` + +--- + +## 6. 
动态训练配置(LoRA + TSDS) + +**示例文件:** `DataFlex/examples/train_lora/selectors/tsds.yaml` + +```yaml +### model +model_name_or_path: +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: #训练集 +template: qwen # 训练模型类型:qwen、llama... +cutoff_len: 4096 +# max_samples: 100000000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 0 +# disable_shuffling: true +seed: 42 + +### output +output_dir: ../dataflex_saves +logging_steps: 10 +save_steps: 100 +plot_loss: true +save_only_model: false +overwrite_output_dir: true + +### swanlab +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] +# use_swanlab: true +# swanlab_project: medical_dynamic_sft +# swanlab_run_name: qwen2_5_3b_lora_medical_50k_baseline +# swanlab_workspace: word2li +# swanlab_api_key: <your_swanlab_api_key> +# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/<your_webhook_id> +# swanlab_lark_secret: <your_lark_secret> + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: false + +### Dataflex args +train_type: dynamic_select # 选择训练器类型。可选值包括: + # "dynamic_select" - 动态选择训练器 + # "dynamic_mix" - 动态混合训练器 + # "dynamic_weight" - 动态加权训练器 + # "static" - 默认静态训练器 +components_cfg_file: src/dataflex/configs/components.yaml +component_name: tsds # 选择组件名称,对应 components_cfg_file 中定义的组件 +warmup_step: 400 +update_step: 500 +update_times: 2 +# eval_dataset: alpaca_zh_demo + +``` + +**参数说明:** + +* `component_name: tsds`:启用 TSDS 组件。 +* `warmup_step / update_step / update_times`:决定**何时**与**多久**进行一次动态选择;总步数 ≈ `warmup_step + update_step × update_times`。 +* 总batch_size=device_number x 
per_device_train_batch_size x gradient_accumulation_steps + +--- + +## 7. 运行训练 + +```bash +FORCE_TORCHRUN=1 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/tsds.yaml +``` +**注意:** 以上示例采用分布式方式启动。 + +训练过程中会在设定的步数触发 TSDS 动态选择:根据离线选择的样本采样概率,选出下一阶段训练子集。 + +--- + +## 8. 模型合并与导出 + +与 Less Selector 流程一致: + +**配置文件:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: 原模型地址 +adapter_name_or_path: 微调后adapter地址 +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves +export_size: 5 +export_device: cpu +export_legacy_format: false +``` + +导出命令: +在llamafactory文件夹中运行 +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. 评估与对比 + +建议使用 [DataFlow](https://github.com/OpenDCAI/DataFlow) 的模型 QA 评估流水线,对 **TSDS** 与 **Less**、**随机采样** 等策略进行并列评测。 \ No newline at end of file