From 2b683a430ec8ae94658736bc940dd62dfd95b58b Mon Sep 17 00:00:00 2001
From: wiILIL <975202246@qq.com>
Date: Mon, 22 Dec 2025 22:57:45 +0800
Subject: [PATCH] revise the embed method

---
 .../guide/selector/selector_offline_near.md | 13 ++++++++-----
 .../guide/selector/selector_offline_tsds.md | 16 +++++++++-------
 .../guide/selector/selector_offline_near.md | 13 ++++++++-----
 .../guide/selector/selector_offline_tsds.md | 16 +++++++++-------
 4 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/docs/en/notes/guide/selector/selector_offline_near.md b/docs/en/notes/guide/selector/selector_offline_near.md
index 9180035..8173cd8 100644
--- a/docs/en/notes/guide/selector/selector_offline_near.md
+++ b/docs/en/notes/guide/selector/selector_offline_near.md
@@ -53,10 +53,13 @@ if __name__ == "__main__":
     near = offline_near_Selector(
         candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = train
         query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = vaildation
-
-        # If you want to use vllm,please add "vllm:" before model's name
-        # Otherwise it automatically use sentence-transfromer
-        embed_model="vllm:Qwen/Qwen3-Embedding-0.6B",
+        # The embedding model; the inference backend is chosen by embed_method below
+        embed_model="Qwen/Qwen3-Embedding-0.6B",
+        # Supported methods:
+        #   "auto": try vLLM first, then fall back to sentence-transformers
+        #   "vllm": use vLLM
+        #   "sentence-transformer": use sentence-transformers
+        embed_method="auto",
         batch_size=32,
         save_indices_path="top_indices.npy",
         max_K=1000,
@@ -65,7 +68,7 @@ if __name__ == "__main__":
     near.selector()
 ```

-Note: model_name is used to encode the already-tokenized text into sentence embeddings (e.g., 512-dim), supporting both vLLM and sentence-transformer inference.
+Note: embed_model is used to encode the already-tokenized text into sentence embeddings (e.g., 1024-dim), supporting both vLLM and sentence-transformers inference.

 Output: save as the indices matrix that contain the max_K close data for each query
 ---
diff --git a/docs/en/notes/guide/selector/selector_offline_tsds.md b/docs/en/notes/guide/selector/selector_offline_tsds.md
index bee1be4..3e3af90 100644
--- a/docs/en/notes/guide/selector/selector_offline_tsds.md
+++ b/docs/en/notes/guide/selector/selector_offline_tsds.md
@@ -77,12 +77,14 @@ Modify training set, embedding model, and parameters inside
 ```python
 if __name__ == "__main__":
     tsds = offline_tsds_Selector(
-        candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # training set
-        query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # validation set
-
-        # If you want to use vllm, please add "vllm:" before the model name
-        # Otherwise it automatically uses sentence-transformer
-        embed_model="vllm:Qwen/Qwen3-Embedding-0.6B", # embedding model
+        candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w",
+        query_path="OpenDCAI/DataFlex-selector-openhermes-10w",
+        embed_model="Qwen/Qwen3-Embedding-0.6B",
+        # Supported methods:
+        #   "auto": try vLLM first, then fall back to sentence-transformers
+        #   "vllm": use vLLM
+        #   "sentence-transformer": use sentence-transformers
+        embed_method="auto",
         batch_size=32,
         save_probs_path="tsds_probs.npy",
         max_K=5000,
@@ -94,7 +96,7 @@ if __name__ == "__main__":
     tsds.selector()
 ```

-Note: model_name is used to encode the already-tokenized text into sentence embeddings (e.g., 512-dim), supporting both vLLM and sentence-transformer inference.
+Note: embed_model is used to encode the already-tokenized text into sentence embeddings (e.g., 1024-dim), supporting both vLLM and sentence-transformers inference.

 Output: a sampling probability for each training sample.
 ---
diff --git a/docs/zh/notes/guide/selector/selector_offline_near.md b/docs/zh/notes/guide/selector/selector_offline_near.md
index 8172975..8fb89f9 100644
--- a/docs/zh/notes/guide/selector/selector_offline_near.md
+++ b/docs/zh/notes/guide/selector/selector_offline_near.md
@@ -48,10 +48,13 @@ if __name__ == "__main__":
     near = offline_near_Selector(
         candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = train
         query_path="OpenDCAI/DataFlex-selector-openhermes-10w", # split = vaildation
-
-        # If you want to use vllm,please add "vllm:" before model's name
-        # Otherwise it automatically use sentence-transfromer
-        embed_model="vllm:Qwen/Qwen3-Embedding-0.6B",
+        # The embedding model; the inference backend is chosen by embed_method below
+        embed_model="Qwen/Qwen3-Embedding-0.6B",
+        # Supported methods:
+        #   "auto": try vLLM first, then fall back to sentence-transformers
+        #   "vllm": use vLLM
+        #   "sentence-transformer": use sentence-transformers
+        embed_method="auto",
         batch_size=32,
         save_indices_path="top_indices.npy",
         max_K=1000,
@@ -61,7 +64,7 @@ if __name__ == "__main__":
 ```


-> **注意**:此处的 `model_name` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 512 维),支持vllm和sentence-transformer 推理。
+> **注意**:此处的 `embed_model` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 1024 维),支持 vLLM 和 sentence-transformers 推理。

 **最终保存为每个query的max_K个最邻近训练数据的索引矩阵 ( N ,max_K )**

diff --git a/docs/zh/notes/guide/selector/selector_offline_tsds.md b/docs/zh/notes/guide/selector/selector_offline_tsds.md
index 5f3ffdc..b255994 100644
--- a/docs/zh/notes/guide/selector/selector_offline_tsds.md
+++ b/docs/zh/notes/guide/selector/selector_offline_tsds.md
@@ -76,12 +76,14 @@ pip install faiss-cpu vllm sentence-transformer
 ```python
 if __name__ == "__main__":
     tsds = offline_tsds_Selector(
-        candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w",#训练集
-        query_path="OpenDCAI/DataFlex-selector-openhermes-10w",#验证集
-
-        # If you want to use vllm,please add "vllm:" before model's name
-        # Otherwise it automatically use sentence-transfromer
-        embed_model="vllm:Qwen/Qwen3-Embedding-0.6B",#编码模型
+        candidate_path="OpenDCAI/DataFlex-selector-openhermes-10w",
+        query_path="OpenDCAI/DataFlex-selector-openhermes-10w",
+        embed_model="Qwen/Qwen3-Embedding-0.6B",
+        # Supported methods:
+        #   "auto": try vLLM first, then fall back to sentence-transformers
+        #   "vllm": use vLLM
+        #   "sentence-transformer": use sentence-transformers
+        embed_method="auto",
         batch_size=32,
         save_probs_path="tsds_probs.npy",
         max_K=5000,
@@ -94,7 +96,7 @@ if __name__ == "__main__":
 ```


-> **注意**:此处的 `model_name` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 512 维),支持vllm和sentence-transformer 推理。
+> **注意**:此处的 `embed_model` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 1024 维),支持 vLLM 和 sentence-transformers 推理。

 **最终保存为每个训练样本的采样概率**
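
For context, the fallback that the new `embed_method` option documents ("auto" tries vLLM first, then sentence-transformers) can be pictured as below. This is a minimal sketch, not DataFlex's actual implementation: the `embed_texts` helper is hypothetical, and the exact vLLM embedding API (`task="embed"`, `LLM.embed()`) varies across vLLM versions.

```python
from typing import List


def embed_texts(
    texts: List[str],
    embed_model: str,
    embed_method: str = "auto",
    batch_size: int = 32,
) -> List[List[float]]:
    """Hypothetical sketch of the documented dispatch:
    "auto" tries vLLM first and falls back to sentence-transformers."""
    if embed_method not in ("auto", "vllm", "sentence-transformer"):
        raise ValueError(f"unsupported embed_method: {embed_method!r}")

    if embed_method in ("auto", "vllm"):
        try:
            # Assumes a recent vLLM with embedding support; the exact API
            # (task name, embed() vs. encode()) differs between versions.
            from vllm import LLM
            llm = LLM(model=embed_model, task="embed")
            return [out.outputs.embedding for out in llm.embed(texts)]
        except Exception:
            if embed_method == "vllm":
                raise  # vLLM was requested explicitly, so surface the failure

    # Fallback for "auto", or the explicitly requested backend.
    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer(embed_model)
    return st_model.encode(texts, batch_size=batch_size).tolist()
```

Under these assumptions, `embed_texts(["hello world"], "Qwen/Qwen3-Embedding-0.6B")` returns one 1024-dim vector per input text, regardless of which backend ends up serving the request.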