From f003ac00724024276d421a1e0383a7861660d693 Mon Sep 17 00:00:00 2001 From: zzy1127 <1726073424@qq.com> Date: Sun, 1 Feb 2026 18:03:28 +0800 Subject: [PATCH] fix: delete useless docs --- docs/.vuepress/notes/en/guide.ts | 12 - docs/.vuepress/notes/zh/guide.ts | 12 - .../text_evaluation_operators.md | 1556 ---------------- .../text_generate_operators.md | 313 ---- .../text_process_operators.md | 252 --- .../text_evaluation_operators.md | 1567 ----------------- .../text_generate_operators.md | 312 ---- .../text_process_operators.md | 249 --- 8 files changed, 4273 deletions(-) delete mode 100644 docs/en/notes/guide/general_operators/text_evaluation_operators.md delete mode 100644 docs/en/notes/guide/general_operators/text_generate_operators.md delete mode 100644 docs/en/notes/guide/general_operators/text_process_operators.md delete mode 100644 docs/zh/notes/guide/general_operators/text_evaluation_operators.md delete mode 100644 docs/zh/notes/guide/general_operators/text_generate_operators.md delete mode 100644 docs/zh/notes/guide/general_operators/text_process_operators.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 37a5b8cbe..6ea317d79 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -44,7 +44,6 @@ export const Guide: ThemeNote = defineNoteConfig({ 'prompted_vqa', 'mathquestion_extract', 'knowledge_cleaning', - 'quick_general_text_evaluation', 'speech_transcription', ], @@ -97,17 +96,6 @@ export const Guide: ThemeNote = defineNoteConfig({ "easy_evaluation", ] }, - { - text: "General Operators", - collapsed: false, - icon: 'material-symbols:analytics-outline', - prefix: 'general_operators', - items: [ - "text_evaluation_operators", - "text_process_operators", - "text_generate_operators", - ] - }, { text: "Domain-Specific Operators", collapsed: false, diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index 31fed3995..f784356f8 100644 --- 
a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -52,7 +52,6 @@ export const Guide: ThemeNote = defineNoteConfig({ "prompted_vqa", "mathquestion_extract", 'knowledge_cleaning', - 'quick_general_text_evaluation', 'speech_transcription', ], }, @@ -95,17 +94,6 @@ export const Guide: ThemeNote = defineNoteConfig({ "easy_evaluation", ] }, - { - text: "通用算子(移动到API)", - collapsed: false, - icon: 'material-symbols:analytics-outline', - prefix: 'general_operators', - items: [ - "text_evaluation_operators", - "text_process_operators", - "text_generate_operators", - ] - }, { text: "专用算子(移动到API)", collapsed: false, diff --git a/docs/en/notes/guide/general_operators/text_evaluation_operators.md b/docs/en/notes/guide/general_operators/text_evaluation_operators.md deleted file mode 100644 index 82561aca6..000000000 --- a/docs/en/notes/guide/general_operators/text_evaluation_operators.md +++ /dev/null @@ -1,1556 +0,0 @@ ---- -title: General Data Evaluation Operators -createTime: 2025/06/09 11:43:25 -permalink: /en/guide/text_evaluation_operators/ ---- - -# Text Data Evaluation Metrics - -## Text quality evaluation - -Scorers are divided into the following four types, each scorer provides one or more scores. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Type | Count | Description |
| --- | --- | --- |
| APIcaller | 3 | Call API for scoring |
| Diversity | 2 | Compute diversity score of the entire dataset |
| Models | 12 | Model or classifier-based scoring |
| Statistics | 3 | Statistical metric scoring |
- -Regarding data types: **[Text]** indicates accepting single-field string input, suitable for pre-training or fine-tuning data. **[Instruction]** indicates only suitable for fine-tuning data with multi-field format input. - -The types of open-source operators are quite limited. In order to achieve better data processing quality and fill the gap in data evaluation methods missing in open-source, we have meticulously designed and self-developed a new set of operators. The meanings of the labels are as follows: - -🚀 Independent Innovation: Core algorithms are original developments, filling gaps in existing algorithms or further improving performance, breaking through current performance bottlenecks. - -✨ Open Source Premiere: This operator is integrated into the mainstream community framework for the first time, making it easier for more developers to use and achieve open-source sharing. - -### List of Scorers - -#### APIcaller - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
| --- | --- | --- | --- | --- | --- |
| AlpagasusScorer✨ | Content Accuracy & Effectiveness | Instruction | Evaluates the quality of instructions by calling GPT, returning a quality score. A higher score indicates higher instruction quality. | [0, 5] | paper |
| PerspectiveScorer✨ | Safety | Text | Uses PerspectiveAPI to evaluate the toxicity of the text, returning a toxicity probability. A higher score indicates higher text toxicity. | [0, 1] | API |
| TreeinstructScorer✨ | Diversity & Complexity | Instruction | Measures instruction complexity by generating the number of nodes in the syntax tree; more nodes indicate more complex instructions. | - | paper |
- -#### Diversity - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
| --- | --- | --- | --- | --- | --- |
| Task2VecScorer✨ | Diversity & Complexity | Text | Evaluates the diversity of the dataset using the Task2Vec method. Higher scores indicate higher dataset diversity. | [0.0525±3.41E-4, 0.4037±1.932E-5] | paper, code |
| VendiScorer | Diversity & Complexity | Text | Evaluates dataset diversity by calculating VendiScore; higher scores indicate higher diversity. | - | paper, code |
- -#### Models - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
| --- | --- | --- | --- | --- | --- |
| DebertaV3Scorer✨ | Content Accuracy & Effectiveness | Text | A quality classifier based on NVIDIA's DeBERTa V3 model for evaluating text quality. | {Low, Medium, High} | code |
| FineWebEduScorer✨ | Educational Value | Text | A classifier for evaluating the educational value of text; higher scores indicate higher educational value. | [0, 5] | paper, code |
| InstagScorer✨ | Diversity & Complexity | Instruction | Evaluates instruction content diversity by returning the number of tags; more tags indicate higher content diversity. | - | paper, code |
| PerplexityScorer | Fluency & Understandability | Text | Calculates text perplexity using the KenLM model; lower scores indicate higher fluency and understandability. | - | paper, code |
| QuratingScorer✨ | Content Accuracy & Effectiveness, Educational Value | Text | Evaluates text quality using the Qurating model; higher scores indicate higher quality. | - | paper, code |
| PairQualScorer🚀 | Educational Value | Text | Evaluates the quality of text using the PairQual model, based on the BGE model. It supports both Chinese and English. It is trained by scoring pairwise comparisons of texts using GPT. A higher score indicates better quality. | - | code |
| PresidioScorer✨ | Safety | Text | Uses the Microsoft Presidio model to identify private entities (PII) in text, such as credit card numbers, names, and locations. The scorer returns the number of PII entities found. | - | code |
| SuperfilteringScorer✨ | Fluency & Understandability | Instruction | Evaluates the instruction-following difficulty using the Superfiltering method; higher scores indicate instructions that are more difficult to follow. | - | paper, code |
| TextbookScorer✨ | Educational Value | Text | A textbook quality classifier based on FastText, used to evaluate the educational value of text. | [0, 2] | paper, code |
| DeitaQualityScorer✨ | Content Accuracy & Effectiveness | Instruction | An instruction quality scorer based on the Llama model; higher scores indicate higher instruction quality. | [1, 6] | paper, code |
| DeitaComplexityScorer✨ | Diversity & Complexity | Instruction | An instruction complexity scorer based on the Llama model; higher scores indicate higher instruction complexity. | [1, 6] | paper, code |
| RMScorer✨ | Fluency & Understandability | Instruction | A reward-model-deberta-v3-large-v2 scorer based on human value judgment. High scores represent higher quality. | - | code |
- -#### Statistics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
| --- | --- | --- | --- | --- | --- |
| LangkitScorer | Text Structure, Fluency & Understandability | Text | Calculates statistical information of text using the Langkit toolkit, such as word count, sentence count, syllable count, etc., to help evaluate the structural complexity and readability of the text. | - | code |
| LexicalDiversityScorer✨ | Diversity & Complexity | Text | Calculates lexical diversity scores using MTLD and HD-D methods; higher scores represent richer vocabulary use, reflecting the diversity and complexity of the text. | - | paper, code |
| NgramScorer | Diversity & Complexity | Text | Calculates the repetition ratio of n-grams in the text to measure text repetition; higher scores indicate lower repetition of n-grams in the text. | [0, 1] | - |
- -### Quality Evaluation System - -To provide more precise data quality evaluation, we have constructed a quality evaluation system based on existing classifiers. Specifically, the output score metrics of each scorer include the following six dimensions. - -#### 1. Text Structure - -- **LangkitScorer**: LangkitSentenceCountScore, LangkitCharacterCountScore, LangkitLetterCountScore, LangkitSyllableCountScore, LangkitPolysyllableCountScore, LangkitMonosyllableCountScore, LangkitLexiconCountScore, LangkitDifficultWordsScore - -#### 2. Diversity & Complexity - -- **LexicalDiversityScorer**: LexicalDiversityMTLDScore, LexicalDiversityHD-DScore -- **NgramScorer**: NgramScore -- **InstagScorer**: InstagScore -- **TreeinstructScorer**: TreeinstructScore -- **Task2VecScorer**: Task2VecDiversityScore (ConfidenceInterval) -- **VendiScorer**: N-gramsVendiScore, BERTVendiScore, SimCSEVendiScore -- **DeitaComplexityScorer:** DeitaComplexityScore - -#### 3. Fluency & Understandability - -- **LangkitScorer**: LangkitFleschReadingEaseScore, LangkitAutomatedReadabilityIndexScore, LangkitAggregateReadingLevelScore -- **PerplexityScorer**: PerplexityScore -- **QuratingScorer**: QuratingWritingStyleScore -- **SuperfilteringScorer**: SuperfilteringScore -- **RMScorer**: RMScore - -#### 4. Safety - -- **PerspectiveScorer**: PerspectiveScore -- **PresidioScorer**: PresidioScore - -#### 5. Educational Value - -- **TextbookScorer**: TextbookScore -- **FineWebEduScorer**: FineWebEduScore -- **QuratingScorer**: QuratingEducationalValueScore -- **PairQualScorer**: PairQualScore - -#### 6. 
Content Accuracy & Effectiveness - -- **QuratingScorer**: QuratingRequiredExpertiseScore, QuratingFactsAndTriviaScore -- **DebertaV3Scorer**: DebertaV3Score -- **AlpagasusScorer**: AlpagasusScore -- **DeitaScorer**: DeitaScore - -### Benchmark Values - -To better provide data quality references, we randomly selected 5k data samples from the currently considered high-quality datasets [Fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) based on data types, and tested the benchmark values of some scorers. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Scorer NameScore Metric NameDescriptionMeanVarianceMaxMin
PerspectiveScorerPerspectiveScoreEvaluates the toxicity of the text, checking for potential insults or inappropriate language. The higher the score, the higher the toxicity0.04260.00250.26100.0026
LexicalDiversityScorerLexicalDiversityMTLDScoreMeasures the lexical diversity of the text; higher scores indicate more varied vocabulary usage. The higher the score, the higher the lexical diversity100.59901625.13181165.716414.8439
LexicalDiversityHD-DScoreUsed to measure the lexical diversity of the text, calculated based on discrete distribution. The higher the score, the higher the lexical diversity0.84870.00140.98730.5570
NgramScorerNgramScoreCalculate the repetition ratio of n-grams in the text to measure the degree of repetition. The higher the score, the lower the n-gram repetition.0.99380.00021.00.8285
LangkitScorerLangkitFleschReadingEaseScoreMeasures Flesch text readability. The higher the score, the easier the text is to read.55.1870324.8975106.37-144.75
LangkitAutomatedReadabilityIndexScoreAutomated readability index based on sentence length and vocabulary difficulty. The higher the score, the harder the text is to read11.772719.411798.20.9
LangkitAggregateReadingLevelScoreAggregate reading difficulty score of the text. The higher the score, the harder the text is to read11.233213.681677.00.0
LangkitSyllableCountScoreCounts the total number of syllables in the text. The higher the score, the more syllables there are.815.38522299853.72724323732
LangkitLexiconCountScoreCounts the total number of words in the text. The higher the score, the more words there are.524.1781061058.58753303323
LangkitSentenceCountScoreCounts the total number of sentences in the text. The higher the score, the more sentences there are.28.96643618.254921931
LangkitCharacterCountScoreCounts the total number of characters in the text. The higher the score, the more characters there are.2610.246223580442.8820139807118
LangkitLetterCountScoreCounts the total number of letters in the text. The higher the score, the more letters there are.2513.457221890120.2030134507109
LangkitPolysyllableCountScoreCounts the number of polysyllabic words in the text. The higher the score, the more polysyllabic words there are.78.883418918.199032610
LangkitMonosyllableCountScoreCounts the number of monosyllabic words, which are usually related to the text's simplicity. The higher the score, the more monosyllabic words there are.334.6674503285.51602513313
LangkitDifficultWordsScoreCounts the number of difficult words in the text. The higher the score, the more difficult words there are.93.411214401.278923664
TextbookScorerTextbookScoreTests whether the text meets textbook standards. The higher the score, the closer the text is to an ideal textbook.0.92550.17791.98670.0001
FineWebEduScorerFineWebEduScoreMeasures the educational value of the text. The higher the score, the greater the educational value.1.19010.49244.6827-0.6319
DebertaV3ScorerDebertaV3ScoreText evaluation using the DebertaV3 model. Quality scores are classified as high, medium, or low.Medium: 3180 times-High: 1412 timesLow: 408 times
PerplexityScorerPerplexityScoreMeasures the perplexity of the text. The higher the score, the greater the model's perplexity.564.3942165893.55428271.013.9
QuratingScorerQuratingWritingStyleScoreEvaluates the quality of the text's writing style. The higher the score, the better the writing style.0.64536.79498.375-7.3474
QuratingRequiredExpertiseScoreMeasures the level of expertise required for the text. The higher the score, the more expertise is required.-0.46617.04589.0-8.25
QuratingFactsAndTriviaScoreTests whether the text contains facts and trivia. The higher the score, the more facts and trivia the text contains.0.18894.56787.4688-6.0993
QuratingEducationalValueScoreMeasures the educational value of the text. The higher the score, the greater the educational value.1.294611.219611.5625-8.7843
InstagScorerInstagScoreEvaluates the content diversity by returning the number of tags. The higher the score, the greater the content diversity.2.3042.9396111
SuperfilteringScorerSuperfilteringScoreEvaluates the instruction-following difficulty using the Superfiltering method. The higher the score, the more difficult it is to follow the instructions.1.3223836.03021978.65340.0011
DeitaQualityScorerDeitaQualityScoreInstruction quality evaluation based on the Llama model. The higher the score, the better the quality of the instructions.3.56290.92475.53091.0840
DeitaComplexityScorerDeitaComplexityScoreInstruction complexity evaluation based on the Llama model. The higher the score, the greater the complexity of the instructions.1.49360.20863.32071.0001
VendiScorerN-grams_VendiScoreEvaluates text diversity based on N-grams embeddings. The higher the score, the greater the dataset diversity.1832.96---
BERT_VendiScoreEvaluates text diversity based on BERT embeddings. The higher the score, the greater the dataset diversity.1.83---
SimCSE_VendiScoreEvaluates text diversity based on SimCSE embeddings. The higher the score, the greater the dataset diversity.68.94---
Task2VecScorerTask2VecScoreEvaluates dataset diversity using Task2Vec diversity coefficient. The higher the score, the greater the dataset diversity.0.0673---
AlpagasusScorerAlpagasusScoreEvaluates instruction quality using ChatGPT. The higher the score, the better the quality of the instructions.4.1720.21645.02.0
TreeinstructScorerTreeinstructScoreUses ChatGPT to evaluate the semantic complexity of instructions. The higher the score, the greater the semantic complexity of the instruction.6.4949.754063.00.0
PresidioScorerPresidioScoreUses Presidio to evaluate the number of PII (Personally Identifiable Information) instances. The higher the score, the more PII information is present in the text.21.40082915.35421786.00.0
RMScorerRMScoreUses a reward model based on human values to evaluate the quality of SFT (Supervised Fine-Tuning) data. The higher the score, the better the data quality.3.15379.94618.6803-4.9680
- -## Detailed Operator Descriptions - -### APIcaller Operators - -#### 1. AlpagasusScorer✨ - -**Function Description:** This operator evaluates instruction quality using GPT, returning a quality score where higher scores indicate better instruction quality. Based on the Alpagasus method, it is specifically designed for evaluating the quality and effectiveness of instruction data. - -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (required, must implement LLMServingABC interface) - - `dimension`: Evaluation dimension (default: "quality") -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Field name for instruction - - `input_input_key`: Field name for input text - - `input_output_key`: Field name for output text - - `output_key`: Field name for output score (default: "AlpagasusScore") - -**Key Features:** - -- GPT-based intelligent quality assessment -- Support for custom evaluation dimensions -- Automatic score parsing -- Suitable for instruction fine-tuning data quality evaluation - -**Usage Example:** - -```python -alpagasus_scorer = AlpagasusScorer( - llm_serving=api_llm_serving, - dimension="quality" - ) -alpagasus_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_input_key="input", - input_output_key="output", - output_key="AlpagasusScore" - ) -``` - -#### 2. PerspectiveScorer✨ - -**Function Description:** This operator assesses text toxicity using PerspectiveAPI, returning toxicity probability where higher scores indicate more toxicity. Specifically designed for detecting harmful content and inappropriate language in text. 
- -**Input Parameters:** - -- `__init__()` - - `serving`: Perspective API serving object -- `run()` - - `storage`: Storage interface object - - `input_key`: Field name for input text - - `output_key`: Field name for output score (default: "PerspectiveScore") - -**Key Features:** - -- Google Perspective API-based toxicity detection -- Automatic text length limit handling (max 20KB) -- Batch processing support -- Returns 0-1 range toxicity probability - -**Usage Example:** - -```python -perspective_scorer = PerspectiveScorer(serving=perspective_api_serving) -perspective_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PerspectiveScore" - ) -``` - -#### 3. TreeinstructScore✨ - -**Function Description:** This operator measures instruction complexity by generating syntax tree node counts; more nodes indicate higher complexity. Based on syntax analysis methods to evaluate the structural complexity of instructions. - -**Input Parameters:** - -- `__init__()` - - No special parameters required -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Field name for instruction - - `output_key`: Field name for output score (default: "TreeinstructScore") - -**Key Features:** - -- Syntax tree analysis-based complexity evaluation -- Automatic instruction syntax structure parsing -- Quantified instruction complexity -- Suitable for instruction diversity analysis - -**Usage Example:** - -```python -treeinstruct_scorer = TreeinstructScore() -treeinstruct_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - output_key="TreeinstructScore" - ) -``` - - -### Diversity Operators - -#### 1. Task2VecScorer✨ - -**Function Description:** This operator assesses dataset diversity using the Task2Vec method; higher scores indicate greater diversity. Based on task embedding methods to calculate similarity and diversity between datasets. 
- -**Input Parameters:** - -- `__init__()` - - No special parameters required -- `run()` - - `storage`: Storage interface object - - `input_key`: Field name for input text - -**Key Features:** - -- Task2Vec method-based diversity evaluation -- Confidence interval calculation -- Suitable for task-level diversity analysis -- Open source first algorithm - -**Usage Example:** - -```python -task2vec_scorer = Task2VecScorer() -result = task2vec_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - -#### 2. VendiScorer - -**Function Description:** This operator assesses dataset diversity using VendiScore with embeddings from BERT and SimCSE models. VendiScore is a diversity measurement method based on kernel matrix eigenvalues that can effectively evaluate dataset richness and coverage. - -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") -- `run()` - - `storage`: Storage interface object - - `input_key`: Field name for input text - -**Key Features:** - -- Multi-model evaluation: Uses BERT, SimCSE, and N-gram methods -- Embedding-based diversity calculation -- Suitable for entire dataset diversity evaluation -- GPU acceleration support - -**Output Format:** - -- `N-gramsVendiScore`: N-gram-based diversity score -- `BERTVendiScore`: BERT-based diversity score -- `SimCSEVendiScore`: SimCSE-based diversity score - -**Usage Example:** - -```python -vendi_scorer = VendiScorer(device="cuda") -result = vendi_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - - -### Models Operators - - -#### 1. DebertaV3Scorer✨ - -**Function Description:** A text quality classifier based on Nvidia Deberta V3 model for evaluating text quality. This operator classifies text into three quality levels: High, Medium, and Low, suitable for large-scale text quality filtering. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 32) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "DebertaV3Score") - -**Key Features:** - -- High-precision text quality classification based on DeBERTa-v3-large model -- Three-level quality classification: High, Medium, Low -- Supports batch processing for improved efficiency -- GPU-accelerated computation -- Suitable for quality evaluation of various text types - -**Evaluation Dimension:** Content Accuracy & Effectiveness - -**Data Type:** Text - -**Value Range:** \{Low, Medium, High\} - -**Usage Example:** - -```python -deberta_scorer = DebertaV3Scorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32 -) -deberta_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="DebertaV3Score" -) -``` - -#### 2. FineWebEduScorer✨ - -**Function Description:** A classifier for evaluating the educational value of text, trained on the FineWeb-Edu dataset. This operator can identify educationally meaningful text content, providing support for educational resource filtering and curriculum content development. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 32) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "FineWebEduScore") - -**Key Features:** - -- Specifically designed for educational value assessment -- Trained on large-scale educational text data -- Fine-grained scoring from 0-5 -- Supports multilingual text evaluation -- Efficient batch processing capability - -**Evaluation Dimension:** Educational Value - -**Data Type:** Text - -**Value Range:** [0, 5] - -**Usage Example:** - -```python -fineweb_edu_scorer = FineWebEduScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32 -) -fineweb_edu_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="FineWebEduScore" -) -``` - -#### 3. InstagScorer✨ - -**Function Description:** Evaluates instruction content diversity by returning the number of tags; more tags indicate greater content diversity. This operator is based on the InsTagger model and can automatically identify different topics and task types involved in instructions. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 16) -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Instruction field name (default: "instruction") - - `output_key`: Output score field name (default: "InstagScore") - -**Key Features:** - -- Multi-label classification based on InsTagger model -- Automatically identifies task types and topics involved in instructions -- Quantifies instruction content diversity -- Supports fine-grained analysis of complex instructions -- Suitable for diversity evaluation of instruction datasets - -**Evaluation Dimension:** Diversity & Complexity - -**Data Type:** Instruction - -**Value Range:** Positive integer (number of tags) - -**Usage Example:** - -```python -instag_scorer = InstagScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16 -) -instag_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - output_key="InstagScore" -) -``` - -#### 4. PerplexityScorer - -**Function Description:** Calculates text perplexity based on Kenlm model; lower perplexity indicates higher fluency and understandability. This operator uses statistical language models to evaluate text naturalness and language quality. 
- -**Input Parameters:** - -- `__init__()` - - `model_path`: Kenlm model path (default: preset model path) - - `language`: Language type (default: "en") -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "PerplexityScore") - -**Key Features:** - -- Based on n-gram statistical language model -- Fast text perplexity calculation -- Supports multiple languages -- Low memory usage with high computational efficiency -- Suitable for large-scale text fluency evaluation - -**Evaluation Dimension:** Fluency & Understandability - -**Data Type:** Text - -**Value Range:** Positive number (perplexity value, lower is better) - -**Usage Example:** - -```python -perplexity_scorer = PerplexityScorer( - model_path="./models/kenlm_model.bin", - language="en" -) -perplexity_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PerplexityScore" -) -``` - - -#### 5. QuratingScorer✨ - -**Function Description:** Evaluates text quality through the Qurating model; higher scores indicate better quality. This operator is based on a multi-dimensional evaluation framework and can assess text quality from multiple perspectives including writing style, educational value, and required expertise. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 16) - - `max_length`: Maximum sequence length (default: 512) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "QuratingScore") - -**Key Features:** - -- Multi-dimensional text quality evaluation -- Trained on large-scale high-quality text -- Supports long text processing -- Provides fine-grained quality scoring -- Suitable for academic and professional text evaluation - -**Evaluation Dimension:** Content Accuracy & Effectiveness, Educational Value - -**Data Type:** Text - -**Value Range:** Continuous values (higher is better) - -**Output Metrics:** -- `QuratingWritingStyleScore`: Writing style score -- `QuratingEducationalValueScore`: Educational value score -- `QuratingRequiredExpertiseScore`: Required expertise score -- `QuratingFactsAndTriviaScore`: Facts and knowledge score - -**Usage Example:** - -```python -qurating_scorer = QuratingScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16, - max_length=512 -) -qurating_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="QuratingScore" -) -``` - -#### 6. PairQualScorer🚀 - -**Function Description:** Evaluates text quality through the PairQual model based on bge model, supporting Chinese and English, trained with GPT pairwise comparison scoring. This is an independently innovative operator specifically optimized for Chinese and English text quality evaluation. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 32) - - `language`: Language type (default: "auto", auto-detection) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "PairQualScore") - -**Key Features:** - -- Bilingual quality evaluation based on BGE model -- Trained with GPT pairwise comparison data -- Supports Chinese and English evaluation -- Independent innovation algorithm -- High-precision quality judgment capability - -**Evaluation Dimension:** Educational Value - -**Data Type:** Text - -**Value Range:** Continuous values (higher is better) - -**Usage Example:** - -```python -pairqual_scorer = PairQualScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32, - language="auto" -) -pairqual_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PairQualScore" -) -``` - -#### 7. PresidioScorer✨ - -**Function Description:** Uses Microsoft Presidio model to identify personally identifiable information (PII) in text such as credit card numbers, names, locations, etc. The scorer returns the count of PII information for evaluating text privacy safety. 
- -**Input Parameters:** - -- `__init__()` - - `language`: Language type (default: "en") - - `entities`: List of entity types to detect (default: ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "LOCATION"]) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "PresidioScore") - -**Key Features:** - -- PII detection based on Microsoft Presidio -- Supports recognition of multiple personal information types -- Customizable entity types for detection -- Supports multilingual text processing -- High-precision privacy information identification - -**Evaluation Dimension:** Safety - -**Data Type:** Text - -**Value Range:** Non-negative integer (number of PII entities) - -**Detected PII Types:** -- PERSON: Person names -- EMAIL_ADDRESS: Email addresses -- PHONE_NUMBER: Phone numbers -- CREDIT_CARD: Credit card numbers -- LOCATION: Geographic locations -- Other configurable types - -**Usage Example:** - -```python -presidio_scorer = PresidioScorer( - language="en", - entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "LOCATION"] -) -presidio_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PresidioScore" -) -``` - -#### 8. SuperfilteringScorer✨ - -**Function Description:** Uses Superfiltering method to evaluate instruction following difficulty; higher scores indicate instructions are harder to follow. This operator is based on instruction complexity analysis and helps identify instructions requiring advanced reasoning capabilities. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 16) -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Instruction field name (default: "instruction") - - `input_output_key`: Output field name (default: "output") - - `output_key`: Output score field name (default: "SuperfilteringScore") - -**Key Features:** - -- Difficulty evaluation based on Superfiltering method -- Evaluates instruction following complexity -- Identifies instructions requiring advanced reasoning -- Supports instruction-response pair analysis -- Suitable for instruction data quality filtering - -**Evaluation Dimension:** Fluency & Understandability - -**Data Type:** Instruction - -**Value Range:** Continuous values (higher indicates harder to follow) - -**Usage Example:** - -```python -superfiltering_scorer = SuperfilteringScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16 -) -superfiltering_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="SuperfilteringScore" -) -``` - -#### 9. TextbookScorer✨ - -**Function Description:** A textbook quality classifier based on FastText classifier for evaluating educational value of text. This operator is specifically designed for educational content and can identify text with textbook quality. 
- -**Input Parameters:** - -- `__init__()` - - `model_path`: FastText model path (default: preset model path) - - `threshold`: Classification threshold (default: 0.5) -- `run()` - - `storage`: Storage interface object - - `input_key`: Input text field name - - `output_key`: Output score field name (default: "TextbookScore") - -**Key Features:** - -- Efficient text classification based on FastText -- Specifically optimized for educational content -- Fast inference speed -- Low memory usage -- Suitable for large-scale educational text filtering - -**Evaluation Dimension:** Educational Value - -**Data Type:** Text - -**Value Range:** [0, 2] - -**Classification Standards:** -- 0: Non-educational content -- 1: General educational content -- 2: High-quality educational content - -**Usage Example:** - -```python -textbook_scorer = TextbookScorer( - model_path="./models/textbook_classifier.bin", - threshold=0.5 -) -textbook_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="TextbookScore" -) -``` - -#### 10. DeitaQualityScorer✨ - -**Function Description:** A Llama-based Deita instruction quality evaluator; higher scores indicate better instruction quality. This operator evaluates instruction quality by generating 1-6 quality scores, particularly suitable for quality filtering of instruction fine-tuning data. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `max_length`: Maximum sequence length (default: 512) - - `batch_size`: Batch processing size (default: 8) -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Instruction text field name (default: "instruction") - - `input_output_key`: Output text field name (default: "output") - - `output_key`: Output score field name (default: "DeitaQualityScore") - -**Key Features:** - -- Professional quality evaluation based on Llama model -- Fine-grained 1-6 scoring -- Uses softmax probability distribution to calculate final score -- Supports batch processing and GPU acceleration -- Specifically optimized for instruction-response pairs - -**Evaluation Dimension:** Content Accuracy & Effectiveness - -**Data Type:** Instruction - -**Value Range:** [1, 6] - -**Scoring Standards:** -- 1 point: Very poor quality, unclear instructions or irrelevant responses -- 2 points: Poor quality, obvious problems exist -- 3 points: Average quality, basically usable but with room for improvement -- 4 points: Good quality, clear instructions and appropriate responses -- 5 points: Very good quality, high-quality instruction-response pairs -- 6 points: Excellent quality, perfect instruction-response pairs - -**Usage Example:** - -```python -deita_quality_scorer = DeitaQualityScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - max_length=512, - batch_size=8 -) -deita_quality_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="DeitaQualityScore" -) -``` - -#### 11. DeitaComplexityScorer✨ - -**Function Description:** A Llama-based Deita instruction complexity evaluator; higher scores indicate greater instruction complexity. 
This operator evaluates the cognitive complexity and execution difficulty of instructions, helping identify challenging instructions. - -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `max_length`: Maximum sequence length (default: 512) - - `batch_size`: Batch processing size (default: 8) -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Instruction text field name (default: "instruction") - - `input_output_key`: Output text field name (default: "output") - - `output_key`: Output score field name (default: "DeitaComplexityScore") - -**Key Features:** - -- Complexity evaluation based on Llama model -- 1-6 complexity scoring -- Evaluates cognitive load of instructions -- Identifies instructions requiring advanced reasoning -- Supports difficulty stratification of instruction datasets - -**Evaluation Dimension:** Diversity & Complexity - -**Data Type:** Instruction - -**Value Range:** [1, 6] - -**Complexity Standards:** -- 1 point: Very simple, basic operations -- 2 points: Simple, direct tasks -- 3 points: Medium, requires some thinking -- 4 points: Complex, requires multi-step reasoning -- 5 points: Very complex, requires advanced reasoning -- 6 points: Extremely complex, requires professional knowledge - -**Usage Example:** - -```python -deita_complexity_scorer = DeitaComplexityScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - max_length=512, - batch_size=8 -) -deita_complexity_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="DeitaComplexityScore" -) -``` - -#### 12. RMScorer✨ - -**Function Description:** A quality scorer based on human value judgment reward model reward-model-deberta-v3-large-v2. Higher scores represent better quality. This operator uses reward models trained with human feedback to evaluate text quality. 
- -**Input Parameters:** - -- `__init__()` - - `device`: Computing device (default: "cuda") - - `model_cache_dir`: Model cache directory (default: "./dataflow_cache") - - `batch_size`: Batch processing size (default: 16) - - `max_length`: Maximum sequence length (default: 512) -- `run()` - - `storage`: Storage interface object - - `input_instruction_key`: Instruction field name (default: "instruction") - - `input_output_key`: Output field name (default: "output") - - `output_key`: Output score field name (default: "RMScore") - -**Key Features:** - -- Reward model trained with human feedback -- Reflects human value judgments and preferences -- Suitable for dialogue and instruction response evaluation -- High-precision quality judgment -- Supports multi-turn dialogue evaluation - -**Evaluation Dimension:** Fluency & Understandability - -**Data Type:** Instruction - -**Value Range:** Continuous values (higher indicates better quality) - -**Evaluation Standards:** -- Considers response helpfulness -- Evaluates content safety -- Judges answer accuracy -- Measures expression clarity - -**Usage Example:** - -```python -rm_scorer = RMScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16, - max_length=512 -) -rm_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="RMScore" -) -``` - -### Statistics Operators - -#### 1. LexicalDiversityScorer ✨ - -**Function Description:** -This operator computes the lexical diversity of a text using the MTLD (Measure of Textual Lexical Diversity) and HDD (Hypergeometric Distribution Diversity) methods to evaluate the richness of vocabulary and expressive variety. - -**Input Parameters:** - -* `__init__()` - - * No special parameters required. 
-* `run()` - - * `storage`: Storage interface object - * `input_key`: Field name of the input text - -**Key Features:** - -* **MTLD method**: Measures how many words are needed to maintain a specified TTR (type-token ratio) threshold, assessing lexical diversity. -* **HDD method**: Estimates lexical richness based on sampling using hypergeometric distribution. -* Automatically handles punctuation and casing. -* Adaptively evaluates texts of varying lengths. - -**Input Requirements:** - -* MTLD evaluation: Requires texts longer than 50 words. -* HDD evaluation: Text length should be between 50 and 1000 words. - -**Output Format:** - -* `LexicalDiversityMTLDScore`: MTLD diversity score (higher = better diversity) -* `LexicalDiversityHD-DScore`: HDD diversity score (higher = better diversity) - -**Usage Example:** - -```python -lexical_scorer = LexicalDiversityScorer() -lexical_scorer.run( - storage=self.storage.step(), - input_key="text" -) -``` - ---- - -#### 2. LangkitScorer - -**Function Description:** -This operator uses the Langkit toolkit to compute statistical information about a text, such as word count, sentence count, and syllable count, aiding in the assessment of structural complexity and readability. - -**Input Parameters:** - -* `__init__()` - - * No special parameters required. 
-* `run()` - - * `storage`: Storage interface object - * `input_key`: Field name of the input text - -**Key Features:** - -* Comprehensive statistical analysis of text -* Multi-dimensional readability evaluation -* Includes Flesch readability score -* Automated readability metrics computation - -**Output Metrics:** - -* Structure: Sentence count, character count, letter count, word count -* Complexity: Syllable count, number of polysyllabic/monosyllabic/difficult words -* Readability: Flesch Reading Ease score, Automated Readability Index, overall reading difficulty - -**Usage Example:** - -```python -langkit_scorer = LangkitScorer() -langkit_scorer.run( - storage=self.storage.step(), - input_key="text" -) -``` - ---- - -#### 3. NgramScorer - -**Function Description:** -This operator calculates the repetition ratio of n-grams within a text, measuring how repetitive it is. Higher scores indicate lower n-gram repetition. - -**Input Parameters:** - -* `__init__()` - - * `n`: Length of the n-gram (default: 3) -* `run()` - - * `storage`: Storage interface object - * `input_key`: Field name of the input text - * `output_key`: Field name for the output score (default: `"NgramScore"`) - -**Key Features:** - -* Repetition analysis based on n-grams -* Configurable n-gram length -* Quantifies textual diversity -* High computational efficiency - -**Usage Example:** - -```python -ngram_scorer = NgramScorer(n=3) -ngram_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="NgramScore" -) -``` - -## Generated text evaluation - -Dataflow integrates three methods for evaluating the quality of generated text, used to evaluate the similarity between generated text and reference text. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Scorer NameDescriptionValue RangeInterpretation
BLEU ScorerCalculates precision based on n-gram matching by comparing n-grams in generated and reference texts[0, 1]Higher values indicate greater match between generated and reference texts
CIDEr ScorerUses TF-IDF weighted n-gram statistics to compare similarity between generated and reference descriptions[0, 1]Higher values indicate stronger content consistency between generated and reference texts
BertScoreComputes similarity of word embeddings between generated and reference texts using BERT[0, 1]Higher values indicate stronger semantic similarity between generated and reference texts
diff --git a/docs/en/notes/guide/general_operators/text_generate_operators.md b/docs/en/notes/guide/general_operators/text_generate_operators.md deleted file mode 100644 index c9e1ff83f..000000000 --- a/docs/en/notes/guide/general_operators/text_generate_operators.md +++ /dev/null @@ -1,313 +0,0 @@ ---- -title: General Generate Operators -createTime: 2025/06/24 21:49:55 -permalink: /en/guide/text_generate_operators/ ---- - -# Text Data Generation -Currently, Dataflow integrates five text data generators, covering various formats such as pretraining document data, SFT-format data, and multi-turn dialogues. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameApplicable TypeDescriptionRepository or Paper
PretrainGeneratorPretrainSynthesize phi-4 question and answer data pairs using pretraining document data, and retell the document in QA formatPaper
SFTGeneratorSeedSFTSynthesize SFT format QA data pairs based on seed documents and return original information-
CondorGeneratorSFTTwo-stage synthesis of SFT-format data from scratch based on preset knowledge tree labels (recommend increasing label variety if generating more than 5000 samples)paper
PromptedGenerator-Generate data based on user-defined prompts-
ConsistentChatGeneratorMulti-turn DialogueTwo-stage synthesis of multi-turn dialogue data from scratch based on preset topics and human intents (recommend increasing label variety if generating more than 9000 samples)paper
- -## Operator Interface Usage Instructions - -Specifically, for operators that specify storage paths or call models, we provide encapsulated **model interfaces** and **storage object interfaces**. You can predefine model API parameters for operators in the following way: - -```python -from dataflow.llmserving import APILLMServing_request - -api_llm_serving = APILLMServing_request( - api_url="your_api_url", - model_name="model_name", - max_workers=5 - ) -``` - -You can predefine storage parameters for operators in the following way: - -```python -from dataflow.utils.storage import FileStorage - - self.storage = FileStorage( - first_entry_file_name="your_file_path", - cache_path="./cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", # jsonl, json, ... - ) -``` - -The `api_llm_serving` and `self.storage` used in the following text are the interface objects defined here. Complete usage examples can be found in `test/test_general_text.py`. - -For parameter passing, the constructor of operator objects mainly passes information related to operator configuration, which can be configured once and called multiple times; while the `X.run()` function passes `key` information related to IO. Details can be seen in the operator description examples below. - - -## Detailed Operator Descriptions - -### 1. PretrainGenerator✨ - -**Function Description:** This operator is specifically designed to generate pretraining format multi-turn dialogue Q&A data based on given document content. It converts raw document content into dialogue format data suitable for language model pretraining by calling large language models to reorganize and express document content. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (required, must implement LLMServingABC interface) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input document content field name (default: "raw_content") - - `output_key`: Output generated content field name (default: "generated_content") - -**Key Features:** - -- Supports content conversion for multiple document formats -- Automatically generates dialogue format data suitable for pretraining -- Maintains integrity of core information from original documents -- Supports batch processing of large-scale document data - -**Usage Example:** - -```python -from dataflow.prompts.general_text import PretrainGeneratorPrompt - -pretrain_gen = PretrainGenerator( - llm_serving=api_llm_serving - ) -result = pretrain_gen.run( - storage=self.storage.step(), - input_key="raw_content", - output_key="generated_content" - ) -``` - - -### 2. SFTGeneratorSeed✨ - -**Function Description:** This operator generates supervised fine-tuning format Q&A data based on given document content and supports user-defined content generation requirements. It extracts information from raw documents to generate instruction-response pairs in SFT format, particularly suitable for building high-quality supervised fine-tuning datasets. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (required, must implement LLMServingABC interface) - - `custom_prompt`: User-defined custom prompt (required, defines specific requirements for generated content) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input document content field name (default: "raw_content") - -**Key Features:** - -- Supports user-defined content generation requirements -- Automatically extracts and parses JSON format instruction-response pairs -- Preserves original document content for traceability -- Intelligently filters invalid generation results -- Supports long text generation up to 4096 tokens - -**Output Format:** - -- DataFrame containing 'instruction', 'output', and 'raw_content' fields -- Returns list containing 'instruction' and 'output' field names - -**Usage Example:** - -```python -from dataflow.prompts.general_text import SFTGeneratorSeedPrompt - -sft_gen = SFTGeneratorSeed( - llm_serving=api_llm_serving, - custom_prompt="Please generate educational Q&A pairs based on document content" - ) -result_keys = sft_gen.run( - storage=self.storage.step(), - input_key="raw_content" - ) -``` - - -### 3. CondorGenerator✨🚀 - -**Function Description:** This operator generates SFT format data from scratch through a two-stage process based on predefined knowledge tree tags. The first stage generates questions of varying difficulty levels (Easy, Medium, Hard) based on randomly selected topics, domains, and theme tags, while the second stage generates corresponding detailed answers for each question. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (required, must implement LLMServingABC interface) - - `num_samples`: Total number of samples to generate (default: 15, recommended to be less than 5000 to ensure data quality) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - -**Key Features:** - -- Two-stage generation process ensures question-answer quality -- Supports three difficulty levels of question generation -- Ensures content diversity based on predefined knowledge tree tags -- Automatically parses and formats generation results -- Intelligent error handling and logging - -**Generation Process:** - -1. **Question Generation Stage**: Generates three difficulty levels of questions based on randomly selected topic, domain, and theme -2. **Answer Generation Stage**: Generates corresponding detailed answers for each valid question -3. **Data Organization Stage**: Organizes questions and answers into standard SFT format - -**Output Format:** - -- DataFrame containing 'difficulty', 'instruction', and 'output' fields -- difficulty field identifies question difficulty level (Easy/Medium/Hard) - -**Usage Example:** - -```python -from dataflow.prompts.general_text import CondorPrompt - -condor_gen = CondorGenerator( - llm_serving=api_llm_serving, - num_samples=150 # Will generate approximately 150 Q&A pairs - ) -result_df = condor_gen.run( - storage=self.storage.step() - ) -``` - -**Important Notes:** - -- When generating more than 5000 samples, it is recommended to increase the number of tags in `dataflow.prompts.general_text.CondorPrompt` to improve data richness -- The operator automatically handles failed parsing responses to ensure output data validity - - -### 4. PromptedGenerator✨ - -**Function Description:** This operator generates data based on user-provided prompts, combining system prompts and input content to generate desired output text. 
It provides maximum flexibility, allowing users to fully customize generation logic and output formats. - -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (required, must implement LLMServingABC interface) - - `system_prompt`: System prompt defining model behavior (default: "You are a helpful agent.") -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input content field name (default: "raw_content") - - `output_key`: Output generated content field name (default: "generated_content") - -**Key Features:** - -- Fully customizable prompt control -- Flexible input-output field configuration -- Supports arbitrary format text generation tasks -- Simple and direct combination of system prompt and input content -- Batch processing capability - -**Working Principle:** - -1. Directly concatenates system prompt with input content -2. Calls LLM to generate corresponding output content -3. Adds generation results to specified output field - -**Usage Example:** - -```python -prompted_gen = PromptedGenerator( - llm_serving=api_llm_serving, - system_prompt="You are a professional document summarizer. Please generate a concise summary for the following content:" - ) -result_key = prompted_gen.run( - storage=self.storage.step(), - input_key="raw_content", - output_key="summary" - ) -``` - -### 5. ConsistentChatGenerator ✨ - -**Description:** -This operator synthesizes multi-turn dialogue data from scratch using a two-stage process based on predefined topics and user intents. In the first stage, it generates user queries under a specific topic and intent; in the second stage, it produces assistant replies for each query. It is ideal for constructing large-scale dialogue datasets with strong consistency and clearly defined categories. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: An instance of an LLM interface implementing the `LLMServingABC` protocol (required) - - `num_dialogs_per_intent`: Number of dialogues to generate per intent (default: 20, recommended ≤ 1000) - - `num_turns_per_dialog`: Number of turns per dialogue (default: 6) - - `temperature`: Sampling temperature controlling generation randomness (default: 0.9) - -- `run()` - - `storage`: The storage interface object (default: uses predefined context) - -**Key Features:** - -- Predefined combinations of topics and intents, covering multiple domains -- Two-stage generation: user queries first, assistant responses second -- Auto-cleaning of malformed or invalid generations -- Supports large-scale synthesis (recommended < 9000 dialogues; extend topic tags for more) -- Generates standardized multi-turn dialogue format compatible with SFT training - -**Output Format:** - -- A DataFrame with `category` and `conversation` fields -- The `conversation` field is a list of multi-turn Q&A items. Each turn follows the structure: - ```json - [ - {"role": "user", "value": "question"}, - {"role": "assistant", "value": "answer"}, - ... - ] - -**Usage Example:** -```python -from dataflow.operators.general_text import ConsistentChatGenerator - -consistent_gen = ConsistentChatGenerator( - llm_serving=api_llm_serving, - num_dialogs_per_intent=30, - num_turns_per_dialog=4, - temperature=0.85 -) - -result_df = consistent_gen.run( - storage=self.storage.step() -) -``` - -**Notes:** - -When generating more than 9000 dialogues, it is recommended to expand the topic_dict in ConsistentChatPrompt to improve the diversity and coverage of the generated conversations. To ensure high-quality output, the operator automatically skips any malformed or unparseable generations, maintaining a consistent and reliable dialogue structure. 
During multi-turn conversation generation, the operator invokes the LLM API twice for each dialogue (once for user questions and once for assistant responses), so a stable and responsive LLM service is essential. \ No newline at end of file diff --git a/docs/en/notes/guide/general_operators/text_process_operators.md b/docs/en/notes/guide/general_operators/text_process_operators.md deleted file mode 100644 index 8ff29ee83..000000000 --- a/docs/en/notes/guide/general_operators/text_process_operators.md +++ /dev/null @@ -1,252 +0,0 @@ ---- -title: General Data Processing Operators - -createTime: 2025/06/09 11:43:25 -permalink: /en/guide/text_process_operators/ ---- - -# Text Data Processing - -## Overview - -DataFlow currently supports text data processing at the data point level, categorized into three types: refiners, deduplicators and filters. - - - - - - - - - - - - - - - - - - - - - - - - - - -
TypeCountDescription
Refiners16Improves the content of data points through processing and augmentation without altering the total count.
Deduplicators6Removes duplicate data points using methods such as hashing.
Filters42Filters data points based on thresholds and other criteria.
- -## Refiners - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameApplicable TypeDescriptionRepository or Paper
CondorRefinerSFTGenerate evaluations and rewrites of SFT responses using LLM APIs to improve QA qualitypaper
LowercaseRefinerNLPConverts text fields to lowercase.-
PIIAnonymizeRefinerPre-trainingAnonymizes Personally Identifiable Information (PII), such as names and locations, to protect privacy.Code
RemovePunctuationRefinerNLPRemoves punctuation from text.-
RemoveNumberRefinerNLPRemoves numeric characters from text.-
RemoveExtraSpacesRefinerNLP, Pre-trainingReplaces multiple consecutive spaces with a single space and trims leading/trailing spaces.-
RemoveRepetitionsPunctuationRefinerNLPRemoves repeated punctuation, e.g., "!!!" becomes "!".-
RemoveEmojiRefinerPre-trainingRemoves emojis from text, e.g., "😀".Code
RemoveEmoticonsRefinerPre-trainingRemoves emoticons such as ":-)", using a predefined list.Code
RemoveContractionsRefinerNLPExpands contractions in text, e.g., "can't" becomes "cannot".Code
HtmlUrlRemoverRefinerPre-trainingRemoves URLs and HTML tags from text.-
TextNormalizationRefinerNLPNormalizes formats for dates, currencies, etc., in text.-
NERRefinerNLPUses Named Entity Recognition (NER) to identify and mask specific entities in text.Code
StemmingLemmatizationRefinerNLPPerforms stemming or lemmatization on text.Code
SpellingCorrectionRefinerNLP, Pre-trainingCorrects spelling errors in text using SymSpell.Code
RemoveStopwordsRefinerNLPRemoves stopwords (e.g., "the", "is") from text.Code
- -## Deduplicators - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeDescriptionRepository or Paper
HashDeduplicatorExact DeduplicationUses various hash functions (e.g., MD5, SHA256, XXH3_128) to remove duplicate data based on exact hash value comparison. Suitable for small-scale simple deduplication.-
CCNetDeduplicatorExact DeduplicationCompares the first 64 bits of the SHA-1 hash to identify duplicate text, balancing security and computational efficiency.-
NgramHashDeduplicatorNear DeduplicationCombines n-gram techniques with hashing to detect duplicates based on multiple hash comparisons of n-gram segments. Useful for identifying near-duplicates.Paper
SemDeduplicatorNear DeduplicationUses semantic similarity based on BERT embeddings and cosine similarity to detect duplicates. Ideal for detecting semantically similar but differently phrased text.Paper
Code
SimHashDeduplicatorNear DeduplicationUses the SimHash algorithm to detect similar text based on Hamming distance of fingerprints. Efficient for large-scale data deduplication.Paper
MinHashDeduplicatorNear DeduplicationCombines MinHash and LSH to compare sets with minimal memory usage and computation cost, detecting similarity between sets.Paper
- -## Filters - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameApplicable TypeDescriptionRepository or Paper
GeneralFilterAny DataFrameSupports flexible filtering of the DataFrame using one or more custom lambda functions -
LanguageFilterPre-training, SFTFilters specific languages using the fasttext language identification model.Huggingface
BlocklistFilterPre-training, SFTFilters data points using a blocklist (e.g., List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words).Code
- -Additionally, Open-DataFlow-Eval supports filtering data points based on scores from single data point scorers, with 18 supported scorers. - -```yaml -DeitaQualityFilter: - min_score: 1 - max_score: 5 - scorer_args: - device: 'cuda:0' - model_name: 'hkust-nlp/deita-quality-scorer' - max_length: 512 -``` -You can set min/max scores and scorer parameters in `scorer_args` for filtering. For more information on supported scorers, refer to the [evaluation algorithm documentation](/en/guide/text_evaluation_operators/) (excluding the Diversity part). - -In addition, heuristic rule filtering plays a significant role in the screening of pre-training data. In this regard, the [Dingo Data Quality Evaluation Tool](https://github.com/DataEval/dingo) has greatly inspired our development. We have integrated some of the rule filtering algorithms used in Dingo, a total of 22 types, into `dataflow/operators/filter/GeneralText/heuristics.py`. For details, please refer to the [Rules Documentation](https://github.com/DataEval/dingo/blob/dev/docs/rules.md). The names of the filters can be found in the `dataflow/operators/filter/GeneralText/heuristics.py` file. - - -All 42 data filters mentioned above share the same `yaml` invocation method. diff --git a/docs/zh/notes/guide/general_operators/text_evaluation_operators.md b/docs/zh/notes/guide/general_operators/text_evaluation_operators.md deleted file mode 100644 index ab8bad6a9..000000000 --- a/docs/zh/notes/guide/general_operators/text_evaluation_operators.md +++ /dev/null @@ -1,1567 +0,0 @@ ---- -title: 通用文本数据评估算子 -createTime: 2025/06/09 11:43:42 -permalink: /zh/guide/text_evaluation_operators/ ---- - - - -# 文本数据评估指标 -## 文本质量评估 -打分器分为以下四种类型,每种打分器会给出一个或多个分数。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
类型数量描述
APIcaller3调用API打分
Diversity2计算整个数据集的多样性得分
Models12基于模型、分类器打分
Statistics3统计学指标打分
- -关于数据类型:【文本】表示接受单一字段字符串输入,可适用于预训练或微调数据。【指令】表示仅适用于微调数据多字段格式输入。 - -开源的算子种类是十分受限的,为了获得更好的数据质量,填补开源缺失的数据评估方法,我们精心设计并**自研**了新的算子集,其标记含义如下: - -- 🚀 **自主创新**:核心算法原创研发,填补现有算法空白或是进一步提升性能,突破当下性能瓶颈。 -- ✨ **开源首发**:首次将该算子集成到社区主流框架中,方便更多开发者使用,实现开源共享。 - - -### 打分器列表 - -#### APIcaller - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称评估维度数据类型简介取值范围官方仓库或论文
AlpagasusScorer✨内容准确性与有效性指令通过调用 GPT 评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。[0, 5]paper
PerspectiveScorer✨安全性文本使用 PerspectiveAPI 评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。[0, 1]API
TreeinstructScorer✨多样性与复杂性指令通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。-paper
- -#### Diversity - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称评估维度数据类型简介取值范围官方仓库或论文
Task2VecScorer✨多样性与复杂性文本评估数据集的多样性,使用 Task2Vec 方法,高分表示数据集具有较高的多样性。[0.0525±3.41E-4, 0.4037±1.932E-5]paper
code
VendiScorer多样性与复杂性文本通过计算 VendiScore 来评估数据集的多样性,得分越高表示多样性越高。-paper
code
- -#### Models - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称评估维度数据类型简介取值范围官方仓库或论文
DebertaV3Scorer✨内容准确性与有效性文本基于 Nvidia Deberta V3 模型的质量分类器,用于评估文本质量。{Low, Medium, High}code
FineWebEduScorer✨教育价值文本用于评估文本教育价值的分类器,高分表示文本具有较高的教育价值。[0, 5]paper
code
InstagScorer✨多样性与复杂性指令通过返回标签的数量来评估指令的内容多样性,标签越多表示内容多样性越大。-paper
code
PerplexityScorer流畅性与可理解性文本基于 Kenlm 模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。-paper
code
QuratingScorer✨内容准确性与有效性、教育价值文本通过 Qurating 模型评估文本的质量,得分越高表示质量越高。-paper
code
PairQualScorer🚀教育价值文本通过 PairQual 模型评估文本的质量,基于 BGE 模型,支持中英双语,使用 GPT 对文本成对比较打分后训练而成。得分越高表示质量越高。-
code
PresidioScorer✨安全性文本使用Microsoft Presidio模型,识别文本中的私人实体(PII)如信用卡号、姓名、位置等。打分器返回PII信息个数。-code
SuperfilteringScorer✨流畅性与可理解性指令使用 Superfiltering 方法评估指令的跟随难度,得分越高表示指令越难跟随。-paper
code
TextbookScorer✨教育价值文本基于 FastText 分类器的课本质量分类器,用于评估文本的教育价值。[0, 2]paper
code
DeitaQualityScorer✨内容准确性与有效性指令基于 Llama 模型的 Deita 指令质量评估器,高分表示指令质量较高。[1,6]paper
code
DeitaComplexityScorer✨多样性与复杂性指令基于 Llama 模型的 Deita 指令复杂性评估器,高分表示指令复杂性较高。[1,6]paper
code
RMScorer✨流畅性与可理解性指令基于人类价值判断的奖励模型reward-model-deberta-v3-large-v2质量评分器。高分代表质量较高。-code
- -#### Statistics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称评估维度数据类型简介取值范围官方仓库或论文
LangkitScorer文本结构、流畅性与可理解性文本使用Langkit工具包计算文本的统计信息,如字数、句子数、音节数等,帮助评估文本的结构复杂性和可读性。-code
LexicalDiversityScorer✨多样性与复杂性文本使用MTLD和HDD方法计算词汇多样性评分,高分代表更丰富的词汇使用,反映文本的多样性和复杂性。-paper
code
NgramScorer多样性与复杂性文本计算文本中n-gram的重复比例,用以衡量文本的重复度,得分越高表示文本中重复的n-gram比例越低。[0, 1]-
- -### 质量评估体系 - -为提供更精准的数据质量评估,我们根据现有的分类器构架了一套质量评估体系。具体到每个打分器的输出分数指标,包括以下6个维度。 - -#### 1. 文本结构 (Text Structure) -- **LangkitScorer**: LangkitSentenceCountScore, LangkitCharacterCountScore, LangkitLetterCountScore, LangkitSyllableCountScore, LangkitPolysyllableCountScore, LangkitMonosyllableCountScore, LangkitLexiconCountScore, LangkitDifficultWordsScore - -#### 2. 多样性与复杂性 (Diversity & Complexity) -- **LexicalDiversityScorer**: LexicalDiversityMTLDScore, LexicalDiversityHD-DScore -- **NgramScorer**: NgramScore -- **InstagScorer**: InstagScore -- **TreeinstructScorer**: TreeinstructScore -- **Task2VecScorer**: Task2VecDiversityScore (ConfidenceInterval) -- **VendiScorer**: N-gramsVendiScore, BERTVendiScore, SimCSEVendiScore -- **DeitaComplexityScorer:** DeitaComplexityScore - - -#### 3. 流畅性与可理解性 (Fluency & Understandability) -- **LangkitScorer**: LangkitFleschReadingEaseScore, LangkitAutomatedReadabilityIndexScore, LangkitAggregateReadingLevelScore -- **PerplexityScorer**: PerplexityScore -- **QuratingScorer**: QuratingWritingStyleScore -- **SuperfilteringScorer**: SuperfilteringScore -- **RMScorer**: RMScore - -#### 4. 安全性 (Safety) -- **PerspectiveScorer**: PerspectiveScore -- **PresidioScorer**: PresidioScore - -#### 5. 教育价值 (Educational Value) -- **TextbookScorer**: TextbookScore -- **FineWebEduScorer**: FineWebEduScore -- **QuratingScorer**: QuratingEducationalValueScore -- **PairQualScorer**: PairQualScore - -#### 6. 
内容准确性与有效性 (Content Accuracy & Effectiveness) -- **QuratingScorer**: QuratingRequiredExpertiseScore, QuratingFactsAndTriviaScore -- **DebertaV3Scorer**: DebertaV3Score -- **AlpagasusScorer**: AlpagasusScore -- **DeitaQualityScorer**: DeitaQualityScore - -### 基准值 - -为更好的提供数据质量参考,我们根据数据类型从目前认为较高质量的[Fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)和[alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)数据集中分别随机选取了5k条数据,并测试了部分打分器的基准值。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
打分器名称分数指标名称简介均值方差最大值最小值
PerspectiveScorerPerspectiveScore评估文本的毒性,是否含有潜在的侮辱性或不当言论。分数越高毒性越大。0.04260.00250.26100.0026
LexicalDiversityScorerLexicalDiversityMTLDScore测量文本的词汇多样性。分数越高词汇多样性越大。100.59901625.13181165.716414.8439
LexicalDiversityHD-DScore用于衡量文本的词汇多样性,基于离散分布计算。分数越高词汇多样性越大。0.84870.00140.98730.5570
NgramScorerNgramScore计算文本中n-gram的重复比例,用以衡量文本的重复度。分数越高N-gram重复性越低。0.99380.00021.00.8285
LangkitScorerLangkitFleschReadingEaseScore衡量文本的Flesch可读性。得分越高表示越易读。55.1870324.8975106.37-144.75
LangkitAutomatedReadabilityIndexScore自动可读性指标,基于句子长度和词汇难度。得分越高表示越难读。11.772719.411798.20.9
LangkitAggregateReadingLevelScore综合文本的阅读难度评分。得分越高表示越难读。11.233213.681677.00.0
LangkitSyllableCountScore统计文本中音节的总数。得分越高音节数量越大。815.38522299853.72724323732
LangkitLexiconCountScore统计文本中词汇的总数。得分越高词汇数量越大。524.1781061058.58753303323
LangkitSentenceCountScore统计文本中的句子数量。得分越高句子数量越大。28.96643618.254921931
LangkitCharacterCountScore统计文本中的字符数量。得分越高字符数量越大。2610.246223580442.8820139807118
LangkitLetterCountScore统计文本中的字母数量。得分越高字母数量越大。2513.457221890120.2030134507109
LangkitPolysyllableCountScore统计多音节单词的数量。得分越高多音节词数量越大。78.883418918.199032610
LangkitMonosyllableCountScore统计单音节单词的数量,通常与文本的简易度相关。得分越高单音节词数量越大。334.6674503285.51602513313
LangkitDifficultWordsScore统计文本中难词的数量。得分越高难词数量越大。93.411214401.278923664
TextbookScorerTextbookScore测试文本是否符合教科书标准。得分越高文本越接近理想教材。0.92550.17791.98670.0001
FineWebEduScorerFineWebEduScore测量文本的教育价值。得分越高文本教育价值越大。1.19010.49244.6827-0.6319
DebertaV3ScorerDebertaV3Score使用DebertaV3模型进行的文本评估。评估质量得分按高、中、低分类。Medium: 3180 次-High: 1412 次Low: 408 次
PerplexityScorerPerplexityScore衡量文本的困惑度。得分越高模型困惑度越大。564.3942165893.55428271.013.9
QuratingScorerQuratingWritingStyleScore评估文本的写作风格是否良好。得分越高文本写作风格越好。0.64536.79498.375-7.3474
QuratingRequiredExpertiseScore衡量文本需要的专业知识水平。得分越高文本越需要专业知识。-0.46617.04589.0-8.25
QuratingFactsAndTriviaScore测试文本是否包含事实和趣闻。得分越高文本包含的事实和趣闻越多。0.18894.56787.4688-6.0993
QuratingEducationalValueScore衡量文本的教育价值。得分越高文本教育价值越大。1.294611.219611.5625-8.7843
InstagScorerInstagScore通过返回标签的数量来评估指令的内容多样性。得分越高内容多样性越大。2.3042.9396111
SuperfilteringScorerSuperfilteringScore使用 Superfiltering 方法评估指令的跟随难度。得分越高指令跟随难度越大。1.3223836.03021978.65340.0011
DeitaQualityScorerDeitaQualityScore基于 Llama 模型的 Deita 指令质量评估器。得分越高指令质量越好。3.56290.92475.53091.0840
DeitaComplexityScorerDeitaComplexityScore基于 Llama 模型的 Deita 指令复杂性评估器。得分越高指令复杂性越大。1.49360.20863.32071.0001
VendiScorerN-grams_VendiScore基于N-grams嵌入评估文本多样性得分。得分越高数据集多样性越大。1832.96---
BERT_VendiScore基于BERT嵌入评估文本多样性得分。得分越高数据集多样性越大。1.83---
SimCSE_VendiScore基于SimCSE嵌入计算文本多样性得分。得分越高数据集多样性越大。68.94---
Task2VecScorerTask2VecScore使用Task2Vec多样性系数评估数据集多样性。得分越高数据集多样性越大。0.0673---
AlpagasusScorerAlpagasusScore调用ChatGPT评估指令质量得分。得分越高指令质量越好。4.1720.21645.02.0
TreeinstructScorerTreeinstructScore调用ChatGPT评估指令语义复杂度。得分越高指令语义复杂度越高。6.4949.754063.00.0
PresidioScorerPresidioScore使用Presidio评估PII个数。得分越高文本含有PII信息越多。21.40082915.35421786.00.0
RMScorerRMScore使用基于人类价值的奖励模型评估SFT数据质量。得分越高数据质量越高。3.15379.94618.6803-4.9680
- -## 算子接口调用说明 - -特别地,对于指定存储路径等或是调用模型的算子,我们提供了封装后的**模型接口**以及**存储对象接口**,可以通过以下方式为算子进行模型API参数预定义: - -```python -from dataflow.llmserving import APILLMServing_request - -api_llm_serving = APILLMServing_request( - api_url="your_api_url", - model_name="model_name", - max_workers=5 - ) -``` - -可以通过以下方式为算子进行存储参数预定义: - -```python -from dataflow.utils.storage import FileStorage - - self.storage = FileStorage( - first_entry_file_name="your_file_path", - cache_path="./cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", # jsonl, json, ... - ) -``` - -后文使用的`api_llm_serving`以及`self.storage`即为此处已定义的接口对象,完整调用示例可参考`test/test_text_evaluation.py`。 - -对于传参,算子对象的构造函数主要传递与算子配置相关的信息,配置后可以一配置多调用;而`X.run()`函数传递与IO相关的`key`信息,详细可见后文算子说明示例。 - - -## 详细算子说明 - -### APIcaller算子 - -#### 1. AlpagasusScorer✨ - -**功能描述:** 该算子通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。基于Alpagasus方法,专门用于评估指令数据的质量和有效性。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现LLMServingABC接口) - - `dimension`:评估维度(默认:"quality") -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令字段名 - - `input_input_key`:输入文本字段名 - - `input_output_key`:输出文本字段名 - - `output_key`:输出得分字段名(默认:"AlpagasusScore") - -**主要特性:** - -- 基于GPT的智能质量评估 -- 支持自定义评估维度 -- 自动解析评分结果 -- 适用于指令微调数据质量评估 - -**使用示例:** - -```python -alpagasus_scorer = AlpagasusScorer( - llm_serving=api_llm_serving, - dimension="quality" - ) -alpagasus_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_input_key="input", - input_output_key="output", - output_key="AlpagasusScore" - ) -``` - -#### 2. 
PerspectiveScorer✨ - -**功能描述:** 该算子使用PerspectiveAPI评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。专门用于检测文本中的有害内容和不当言论。 - -**输入参数:** - -- `__init__()` - - `serving`:Perspective API服务对象 -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"PerspectiveScore") - -**主要特性:** - -- 基于Google Perspective API的毒性检测 -- 自动处理文本长度限制(最大20KB) -- 支持批量处理 -- 返回0-1范围的毒性概率 - -**使用示例:** - -```python -perspective_scorer = PerspectiveScorer(serving=perspective_api_serving) -perspective_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PerspectiveScore" - ) -``` - -#### 3. TreeinstructScore✨ - -**功能描述:** 该算子通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。基于语法分析的方法评估指令的结构复杂度。 - -**输入参数:** - -- `__init__()` - - 无需特殊参数 -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令字段名 - - `output_key`:输出得分字段名(默认:"TreeinstructScore") - -**主要特性:** - -- 基于语法树分析的复杂度评估 -- 自动解析指令语法结构 -- 量化指令复杂性 -- 适用于指令多样性分析 - -**使用示例:** - -```python -treeinstruct_scorer = TreeinstructScore() -treeinstruct_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - output_key="TreeinstructScore" - ) -``` - - -### Diversity算子 - -#### 1. Task2VecScorer✨ - -**功能描述:** 该算子评估数据集的多样性,使用Task2Vec方法,高分表示数据集具有较高的多样性。基于任务嵌入的方法来计算数据集间的相似性和多样性。 - -**输入参数:** - -- `__init__()` - - 无需特殊参数 -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - -**主要特性:** - -- 基于Task2Vec方法的多样性评估 -- 计算置信区间 -- 适用于任务级别的多样性分析 -- 开源首发算法 - -**使用示例:** - -```python -task2vec_scorer = Task2VecScorer() -result = task2vec_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - -#### 2. 
VendiScorer - -**功能描述:** 该算子通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。VendiScore是一种基于核矩阵特征值的多样性度量方法,能够有效评估数据集的丰富性和覆盖范围。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - -**主要特性:** - -- 多模型评估:同时使用BERT、SimCSE和N-gram方法 -- 基于嵌入的多样性计算 -- 适用于整个数据集的多样性评估 -- 支持GPU加速计算 - -**输出格式:** - -- `N-gramsVendiScore`:基于N-gram的多样性得分 -- `BERTVendiScore`:基于BERT的多样性得分 -- `SimCSEVendiScore`:基于SimCSE的多样性得分 - -**使用示例:** - -```python -vendi_scorer = VendiScorer(device="cuda") -result = vendi_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - - -### Models算子 - - -#### 1. DebertaV3Scorer✨ - -**功能描述:** 基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量。该算子将文本分类为高(High)、中(Medium)、低(Low)三个质量等级,适用于大规模文本质量筛选。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:32) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"DebertaV3Score") - -**主要特性:** - -- 基于DeBERTa-v3-large模型的高精度文本质量分类 -- 三级质量分类:High、Medium、Low -- 支持批量处理,提高处理效率 -- GPU加速计算 -- 适用于多种文本类型的质量评估 - -**评估维度:** 内容准确性与有效性 - -**数据类型:** 文本 - -**取值范围:** \{Low, Medium, High\} - -**使用示例:** - -```python -deberta_scorer = DebertaV3Scorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32 -) -deberta_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="DebertaV3Score" -) -``` - -#### 2. 
FineWebEduScorer✨ - -**功能描述:** 用于评估文本教育价值的分类器,基于FineWeb-Edu数据集训练。该算子能够识别具有教育意义的文本内容,为教育资源筛选和课程内容开发提供支持。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:32) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"FineWebEduScore") - -**主要特性:** - -- 专门针对教育价值评估设计 -- 基于大规模教育文本数据训练 -- 0-5分的细粒度评分 -- 支持多语言文本评估 -- 高效的批量处理能力 - -**评估维度:** 教育价值 - -**数据类型:** 文本 - -**取值范围:** [0, 5] - -**使用示例:** - -```python -fineweb_edu_scorer = FineWebEduScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32 -) -fineweb_edu_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="FineWebEduScore" -) -``` - -#### 3. InstagScorer✨ - -**功能描述:** 通过返回标签的数量来评估指令的内容多样性,标签越多表示内容多样性越大。该算子基于InsTagger模型,能够自动识别指令中涉及的不同主题和任务类型。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:16) -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令字段名(默认:"instruction") - - `output_key`:输出得分字段名(默认:"InstagScore") - -**主要特性:** - -- 基于InsTagger模型的多标签分类 -- 自动识别指令涉及的任务类型和主题 -- 量化指令内容的多样性 -- 支持复杂指令的细粒度分析 -- 适用于指令数据集的多样性评估 - -**评估维度:** 多样性与复杂性 - -**数据类型:** 指令 - -**取值范围:** 正整数(标签数量) - -**使用示例:** - -```python -instag_scorer = InstagScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16 -) -instag_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - output_key="InstagScore" -) -``` - -#### 4. 
PerplexityScorer - -**功能描述:** 基于Kenlm模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。该算子使用统计语言模型评估文本的自然度和语言质量。 - -**输入参数:** - -- `__init__()` - - `model_path`:Kenlm模型路径(默认:预设模型路径) - - `language`:语言类型(默认:"en") -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"PerplexityScore") - -**主要特性:** - -- 基于n-gram统计语言模型 -- 快速计算文本困惑度 -- 支持多种语言 -- 内存占用小,计算效率高 -- 适用于大规模文本流畅性评估 - -**评估维度:** 流畅性与可理解性 - -**数据类型:** 文本 - -**取值范围:** 正数(困惑度值,越小越好) - -**使用示例:** - -```python -perplexity_scorer = PerplexityScorer( - model_path="./models/kenlm_model.bin", - language="en" -) -perplexity_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PerplexityScore" -) -``` - - -#### 5. QuratingScorer✨ - -**功能描述:** 通过Qurating模型评估文本的质量,得分越高表示质量越高。该算子基于多维度评估框架,能够从写作风格、教育价值、专业知识要求等多个角度评估文本质量。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:16) - - `max_length`:最大序列长度(默认:512) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"QuratingScore") - -**主要特性:** - -- 多维度文本质量评估 -- 基于大规模高质量文本训练 -- 支持长文本处理 -- 提供细粒度的质量评分 -- 适用于学术和专业文本评估 - -**评估维度:** 内容准确性与有效性、教育价值 - -**数据类型:** 文本 - -**取值范围:** 连续数值(越高越好) - -**输出指标:** -- `QuratingWritingStyleScore`:写作风格评分 -- `QuratingEducationalValueScore`:教育价值评分 -- `QuratingRequiredExpertiseScore`:专业知识要求评分 -- `QuratingFactsAndTriviaScore`:事实和知识评分 - -**使用示例:** - -```python -qurating_scorer = QuratingScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16, - max_length=512 -) -qurating_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="QuratingScore" -) -``` - -#### 6. 
PairQualScorer🚀 - -**功能描述:** 通过PairQual模型评估文本的质量,基于bge模型,支持中英双语,使用GPT对文本成对比较打分后训练而成。这是一个自主创新的算子,专门针对中英文文本质量评估进行了优化。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:32) - - `language`:语言类型(默认:"auto",自动检测) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"PairQualScore") - -**主要特性:** - -- 基于BGE模型的双语质量评估 -- 使用GPT成对比较数据训练 -- 支持中英文双语评估 -- 自主创新算法 -- 高精度的质量判断能力 - -**评估维度:** 教育价值 - -**数据类型:** 文本 - -**取值范围:** 连续数值(越高越好) - -**使用示例:** - -```python -pairqual_scorer = PairQualScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=32, - language="auto" -) -pairqual_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PairQualScore" -) -``` - -#### 7. PresidioScorer✨ - -**功能描述:** 使用Microsoft Presidio模型,识别文本中的私人实体(PII)如信用卡号、姓名、位置等。打分器返回PII信息个数,用于评估文本的隐私安全性。 - -**输入参数:** - -- `__init__()` - - `language`:语言类型(默认:"en") - - `entities`:要检测的实体类型列表(默认:["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "LOCATION"]) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"PresidioScore") - -**主要特性:** - -- 基于Microsoft Presidio的PII检测 -- 支持多种个人信息类型识别 -- 可自定义检测的实体类型 -- 支持多语言文本处理 -- 高精度的隐私信息识别 - -**评估维度:** 安全性 - -**数据类型:** 文本 - -**取值范围:** 非负整数(PII实体数量) - -**检测的PII类型:** -- PERSON:人名 -- EMAIL_ADDRESS:邮箱地址 -- PHONE_NUMBER:电话号码 -- CREDIT_CARD:信用卡号 -- LOCATION:地理位置 -- 其他可配置类型 - -**使用示例:** - -```python -presidio_scorer = PresidioScorer( - language="en", - entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "LOCATION"] -) -presidio_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="PresidioScore" -) -``` - -#### 8. 
SuperfilteringScorer✨ - -**功能描述:** 使用Superfiltering方法评估指令的跟随难度,得分越高表示指令越难跟随。该算子基于指令复杂性分析,帮助识别需要高级推理能力的指令。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:16) -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令字段名(默认:"instruction") - - `input_output_key`:输出字段名(默认:"output") - - `output_key`:输出得分字段名(默认:"SuperfilteringScore") - -**主要特性:** - -- 基于Superfiltering方法的难度评估 -- 评估指令的跟随复杂度 -- 识别需要高级推理的指令 -- 支持指令-响应对分析 -- 适用于指令数据质量筛选 - -**评估维度:** 流畅性与可理解性 - -**数据类型:** 指令 - -**取值范围:** 连续数值(越高表示越难跟随) - -**使用示例:** - -```python -superfiltering_scorer = SuperfilteringScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16 -) -superfiltering_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="SuperfilteringScore" -) -``` - -#### 9. TextbookScorer✨ - -**功能描述:** 基于FastText分类器的课本质量分类器,用于评估文本的教育价值。该算子专门针对教育内容设计,能够识别具有课本质量的文本。 - -**输入参数:** - -- `__init__()` - - `model_path`:FastText模型路径(默认:预设模型路径) - - `threshold`:分类阈值(默认:0.5) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"TextbookScore") - -**主要特性:** - -- 基于FastText的高效文本分类 -- 专门针对教育内容优化 -- 快速的推理速度 -- 低内存占用 -- 适用于大规模教育文本筛选 - -**评估维度:** 教育价值 - -**数据类型:** 文本 - -**取值范围:** [0, 2] - -**分类标准:** -- 0:非教育内容 -- 1:一般教育内容 -- 2:高质量教育内容 - -**使用示例:** - -```python -textbook_scorer = TextbookScorer( - model_path="./models/textbook_classifier.bin", - threshold=0.5 -) -textbook_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="TextbookScore" -) -``` - -#### 10. 
DeitaQualityScorer✨ - -**功能描述:** 基于Llama模型的Deita指令质量评估器,高分表示指令质量较高。该算子通过生成1-6分的质量评分来评估指令质量,特别适用于指令微调数据的质量筛选。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `max_length`:最大序列长度(默认:512) - - `batch_size`:批处理大小(默认:8) -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令文本字段名(默认:"instruction") - - `input_output_key`:输出文本字段名(默认:"output") - - `output_key`:输出得分字段名(默认:"DeitaQualityScore") - -**主要特性:** - -- 基于Llama模型的专业质量评估 -- 1-6分的细粒度评分 -- 使用softmax概率分布计算最终得分 -- 支持批量处理和GPU加速 -- 专门针对指令-响应对优化 - -**评估维度:** 内容准确性与有效性 - -**数据类型:** 指令 - -**取值范围:** [1, 6] - -**评分标准:** -- 1分:质量很差,指令不清晰或响应不相关 -- 2分:质量较差,存在明显问题 -- 3分:质量一般,基本可用但有改进空间 -- 4分:质量良好,指令清晰且响应合适 -- 5分:质量很好,高质量的指令-响应对 -- 6分:质量优秀,完美的指令-响应对 - -**使用示例:** - -```python -deita_quality_scorer = DeitaQualityScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - max_length=512, - batch_size=8 -) -deita_quality_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="DeitaQualityScore" -) -``` - -#### 11. 
DeitaComplexityScorer✨ - -**功能描述:** 基于Llama模型的Deita指令复杂性评估器,高分表示指令复杂性较高。该算子评估指令的认知复杂度和执行难度,帮助识别具有挑战性的指令。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `max_length`:最大序列长度(默认:512) - - `batch_size`:批处理大小(默认:8) -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令文本字段名(默认:"instruction") - - `input_output_key`:输出文本字段名(默认:"output") - - `output_key`:输出得分字段名(默认:"DeitaComplexityScore") - -**主要特性:** - -- 基于Llama模型的复杂性评估 -- 1-6分的复杂度评分 -- 评估指令的认知负荷 -- 识别需要高级推理的指令 -- 支持指令数据集的难度分层 - -**评估维度:** 多样性与复杂性 - -**数据类型:** 指令 - -**取值范围:** [1, 6] - -**复杂度标准:** -- 1分:非常简单,基础操作 -- 2分:简单,直接任务 -- 3分:中等,需要一定思考 -- 4分:复杂,需要多步推理 -- 5分:很复杂,需要高级推理 -- 6分:极其复杂,需要专业知识 - -**使用示例:** - -```python -deita_complexity_scorer = DeitaComplexityScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - max_length=512, - batch_size=8 -) -deita_complexity_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="DeitaComplexityScore" -) -``` - -#### 12. 
RMScorer✨ - -**功能描述:** 基于人类价值判断的奖励模型reward-model-deberta-v3-large-v2质量评分器。高分代表质量较高。该算子使用经过人类反馈训练的奖励模型来评估文本质量。 - -**输入参数:** - -- `__init__()` - - `device`:计算设备(默认:"cuda") - - `model_cache_dir`:模型缓存目录(默认:"./dataflow_cache") - - `batch_size`:批处理大小(默认:16) - - `max_length`:最大序列长度(默认:512) -- `run()` - - `storage`:存储接口对象 - - `input_instruction_key`:指令字段名(默认:"instruction") - - `input_output_key`:输出字段名(默认:"output") - - `output_key`:输出得分字段名(默认:"RMScore") - -**主要特性:** - -- 基于人类反馈训练的奖励模型 -- 反映人类价值判断和偏好 -- 适用于对话和指令响应评估 -- 高精度的质量判断 -- 支持多轮对话评估 - -**评估维度:** 流畅性与可理解性 - -**数据类型:** 指令 - -**取值范围:** 连续数值(越高表示质量越好) - -**评估标准:** -- 考虑响应的有用性 -- 评估内容的安全性 -- 判断回答的准确性 -- 衡量表达的清晰度 - -**使用示例:** - -```python -rm_scorer = RMScorer( - device="cuda", - model_cache_dir="./dataflow_cache", - batch_size=16, - max_length=512 -) -rm_scorer.run( - storage=self.storage.step(), - input_instruction_key="instruction", - input_output_key="output", - output_key="RMScore" -) -``` - - -### Statistics算子 - -#### 1. LexicalDiversityScorer✨ - -**功能描述:** 该算子使用MTLD(文本词汇多样性测量)和HDD(超几何分布多样性)方法计算文本词汇多样性,评估文本的词汇丰富度和表达多样性。 - -**输入参数:** - -- `__init__()` - - 无需特殊参数 -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - -**主要特性:** - -- **MTLD方法**:通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性 -- **HDD方法**:基于样本的词汇丰富度估计,使用超几何分布计算 -- 自动处理标点符号和大小写 -- 支持不同长度文本的适应性评估 - -**输入要求:** - -- MTLD评估:文本长度需大于50个单词 -- HDD评估:文本长度需在50-1000个单词之间 - -**输出格式:** - -- `LexicalDiversityMTLDScore`:MTLD多样性得分(值越高表示多样性越好) -- `LexicalDiversityHD-DScore`:HDD多样性得分(值越高表示多样性越好) - -**使用示例:** - -```python -lexical_scorer = LexicalDiversityScorer() -lexical_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - -#### 2. 
LangkitScorer - -**功能描述:** 该算子使用Langkit工具包计算文本的统计信息,如字数、句子数、音节数等,帮助评估文本的结构复杂性和可读性。 - -**输入参数:** - -- `__init__()` - - 无需特殊参数 -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - -**主要特性:** - -- 全面的文本统计分析 -- 多维度可读性评估 -- 包含Flesch可读性评分 -- 自动化可读性指标计算 - -**输出指标:** - -- 文本结构:句子数、字符数、字母数、词汇数 -- 复杂性:音节数、多音节词数、单音节词数、难词数 -- 可读性:Flesch可读性评分、自动可读性指标、综合阅读难度 - -**使用示例:** - -```python -langkit_scorer = LangkitScorer() -langkit_scorer.run( - storage=self.storage.step(), - input_key="text" - ) -``` - -#### 3. NgramScorer - -**功能描述:** 该算子计算文本中n-gram的重复比例,用以衡量文本的重复度,得分越高表示文本中重复的n-gram比例越低。 - -**输入参数:** - -- `__init__()` - - `n`:n-gram的长度(默认:3) -- `run()` - - `storage`:存储接口对象 - - `input_key`:输入文本字段名 - - `output_key`:输出得分字段名(默认:"NgramScore") - -**主要特性:** - -- 基于n-gram的重复度分析 -- 可配置n-gram长度 -- 量化文本多样性 -- 计算效率高 - -**使用示例:** - -```python -ngram_scorer = NgramScorer(n=3) -ngram_scorer.run( - storage=self.storage.step(), - input_key="text", - output_key="NgramScore" - ) -``` - -## 生成文本质量评估 - -Dataflow集成了三种生成文本质量评估方法,用于评估生成文本和参考文本之间的相似性。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
打分器名称简介取值范围值解释
BLEU Scorer基于 n-gram 匹配的精确度计算,将生成文本中的 n-gram 与参考文本中的 n-gram 进行匹配并计算精确度[0, 1]值越大,表示生成文本与参考文本的匹配程度越高
CIDEr Scorer利用 TF-IDF 加权的 n-gram 统计,将生成文本的描述与参考描述进行相似性比较[0, 1]值越大,表示生成文本与参考文本在内容上越一致
BertScorer使用 Bert 模型计算生成文本与参考文本的词向量相似性,输出精确度、召回率和 F1 分数[0, 1]值越大,表示生成文本与参考文本在语义上越相似
diff --git a/docs/zh/notes/guide/general_operators/text_generate_operators.md b/docs/zh/notes/guide/general_operators/text_generate_operators.md deleted file mode 100644 index 4e420c5f9..000000000 --- a/docs/zh/notes/guide/general_operators/text_generate_operators.md +++ /dev/null @@ -1,312 +0,0 @@ ---- -title: 通用文本数据合成算子 -createTime: 2025/06/24 21:49:55 -permalink: /zh/guide/lo3cyadt/ ---- - -# 文本数据合成 - -目前Dataflow集成了五种基础文本数据合成器,涉及预训练文档数据、SFT格式数据、多轮对话等不同格式。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称适用类型简介官方仓库或论文
PretrainGenerator预训练使用预训练文档数据合成类phi-4问答数据对,使用QA格式复述文档Paper
SFTGeneratorSeedSFT根据种子文档合成SFT格式QA数据对,并返回原文信息-
CondorGeneratorSFT根据预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)paper
PromptedGenerator-根据用户自定义prompt进行数据生成-
ConsistentChatGenerator多轮对话根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量)paper
- -## 算子接口调用说明 - -特别地,对于指定存储路径等或是调用模型的算子,我们提供了封装后的**模型接口**以及**存储对象接口**,可以通过以下方式为算子进行模型API参数预定义: - -```python -from dataflow.llmserving import APILLMServing_request - -api_llm_serving = APILLMServing_request( - api_url="your_api_url", - model_name="model_name", - max_workers=5 - ) -``` -可以通过以下方式为算子进行存储参数预定义: - -```python -from dataflow.utils.storage import FileStorage - - self.storage = FileStorage( - first_entry_file_name="your_file_path", - cache_path="./cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", # jsonl, json, ... - ) -``` - -后文使用的`api_llm_serving`以及`self.storage`即为此处已定义的接口对象,完整调用示例可参考`test/test_general_text.py`。 - -对于传参,算子对象的构造函数主要传递与算子配置相关的信息,配置后可以一配置多调用;而`X.run()`函数传递与IO相关的`key`信息,详细可见后文算子说明示例。 - - -## 详细算子说明 - -### 1. PretrainGenerator✨ - -**功能描述:** 该算子专门用于基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据,通过调用大语言模型进行文档内容的重新组织和表达。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现LLMServingABC接口) -- `run()` - - `storage`:存储接口对象(默认:前文预设值) - - `input_key`:输入文档内容字段名(默认:"raw_content") - - `output_key`:输出生成内容字段名(默认:"generated_content") - -**主要特性:** - -- 支持多种文档格式的内容转换 -- 自动生成适合预训练的对话格式数据 -- 保持原始文档的核心信息完整性 -- 支持批量处理大规模文档数据 - -**使用示例:** - -```python -from dataflow.prompts.general_text import PretrainGeneratorPrompt - -pretrain_gen = PretrainGenerator( - llm_serving=api_llm_serving - ) -result = pretrain_gen.run( - storage=self.storage.step(), - input_key="raw_content", - output_key="generated_content" - ) -``` - - -### 2. 
SFTGeneratorSeed✨ - -**功能描述:** 该算子基于给定文档内容,生成监督微调格式的问答数据,并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对,特别适用于构建高质量的监督微调数据集。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现LLMServingABC接口) - - `custom_prompt`:用户自定义提示词(必需,定义生成内容的具体要求) -- `run()` - - `storage`:存储接口对象(默认:前文预设值) - - `input_key`:输入文档内容字段名(默认:"raw_content") - -**主要特性:** - -- 支持用户自定义生成内容要求 -- 自动提取和解析JSON格式的指令-响应对 -- 保留原始文档内容用于追溯 -- 智能过滤无效生成结果 -- 支持最大4096 tokens的长文本生成 - -**输出格式:** - -- 包含'instruction'、'output'和'raw_content'字段的DataFrame -- 返回包含'instruction'和'output'字段名的列表 - -**使用示例:** - -```python -from dataflow.prompts.general_text import SFTGeneratorSeedPrompt - -sft_gen = SFTGeneratorSeed( - llm_serving=api_llm_serving, - custom_prompt="请基于文档内容生成教学问答对" - ) -result_keys = sft_gen.run( - storage=self.storage.step(), - input_key="raw_content" - ) -``` - - -### 3. CondorGenerator✨🚀 - -**功能描述:** 该算子基于预置知识树标签,通过两阶段流程从零合成SFT格式数据。第一阶段根据随机选择的主题、领域和主题标签生成不同难度级别(Easy、Medium、Hard)的问题,第二阶段为每个问题生成对应的详细答案。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现LLMServingABC接口) - - `num_samples`:生成样本总数(默认:15,建议小于5000以保证数据质量) -- `run()` - - `storage`:存储接口对象(默认:前文预设值) - -**主要特性:** - -- 两阶段生成流程确保问答质量 -- 支持三个难度级别的问题生成 -- 基于预置知识树标签保证内容多样性 -- 自动解析和格式化生成结果 -- 智能错误处理和日志记录 - -**生成流程:** - -1. **问题生成阶段**:根据随机选择的topic、domain、theme生成三个难度级别的问题 -2. **答案生成阶段**:为每个有效问题生成对应的详细答案 -3. **数据整理阶段**:将问题和答案组织成标准SFT格式 - -**输出格式:** - -- 包含'difficulty'、'instruction'和'output'字段的DataFrame -- difficulty字段标识问题难度级别(Easy/Medium/Hard) - -**使用示例:** - -```python -from dataflow.prompts.general_text import CondorPrompt - -condor_gen = CondorGenerator( - llm_serving=api_llm_serving, - num_samples=150 # 将生成约150个问答对 - ) -result_df = condor_gen.run( - storage=self.storage.step() - ) -``` - -**注意事项:** - -- 当生成数量大于5000时,建议在`dataflow.prompts.general_text.CondorPrompt`中增加标签数量以提高数据丰富性 -- 算子会自动处理解析失败的响应,确保输出数据的有效性 - - -### 4. 
PromptedGenerator✨ - -**功能描述:** 该算子基于用户提供的提示词(prompt)生成数据,结合系统提示词和输入内容生成符合要求的输出文本。提供了最大的灵活性,允许用户完全自定义生成逻辑和输出格式。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现LLMServingABC接口) - - `system_prompt`:系统提示词,定义模型行为(默认:"You are a helpful agent.") -- `run()` - - `storage`:存储接口对象(默认:前文预设值) - - `input_key`:输入内容字段名(默认:"raw_content") - - `output_key`:输出生成内容字段名(默认:"generated_content") - -**主要特性:** - -- 完全自定义的提示词控制 -- 灵活的输入输出字段配置 -- 支持任意格式的文本生成任务 -- 简单直接的系统提示词与输入内容组合 -- 批量处理能力 - -**工作原理:** - -1. 将系统提示词与输入内容直接拼接 -2. 调用LLM生成对应的输出内容 -3. 将生成结果添加到指定的输出字段 - -**使用示例:** - -```python -prompted_gen = PromptedGenerator( - llm_serving=api_llm_serving, - system_prompt="你是一个专业的文档摘要生成器,请为以下内容生成简洁的摘要:" - ) -result_key = prompted_gen.run( - storage=self.storage.step(), - input_key="raw_content", - output_key="summary" - ) -``` - -### 5. ConsistentChatGenerator ✨ - -**功能描述:** -该算子基于预置主题和人类意图,通过两阶段流程从零合成多轮对话格式数据。第一阶段生成特定主题和意图下的多轮用户提问,第二阶段为每轮问题生成对应回复。适用于构建一致性强、类别明确的大规模对话语料。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:使用的大语言模型接口对象(必需,需实现 LLMServingABC 接口) - - `num_dialogs_per_intent`:每个意图生成的对话数量(默认:20,建议不超过1000) - - `num_turns_per_dialog`:每个对话包含的轮次数量(默认:6) - - `temperature`:生成温度,控制采样随机性(默认:0.9) - -- `run()` - - `storage`:存储接口对象(默认:前文预设值) - -**主要特性:** - -- 预置主题与意图组合,覆盖多领域场景 -- 两阶段生成:先生成用户问题,再生成对应回答 -- 自动清洗格式错误及无效结果 -- 支持大规模批量合成(推荐数量 < 9000,超过建议扩展主题标签) -- 生成标准的多轮对话格式,适配常见 SFT 训练任务 - -**输出格式:** - -- 包含 `category` 和 `conversation` 字段的 DataFrame -- `conversation` 字段为多轮问答组成的对话列表,每轮对话结构为: - ```json - [ - {"role": "user", "value": "问题"}, - {"role": "assistant", "value": "回答"}, - ... 
- ] - -**使用示例:** -```python -from dataflow.operators.general_text import ConsistentChatGenerator - -consistent_gen = ConsistentChatGenerator( - llm_serving=api_llm_serving, - num_dialogs_per_intent=30, - num_turns_per_dialog=4, - temperature=0.85 -) - -result_df = consistent_gen.run( - storage=self.storage.step() -) - -``` - -**注意事项:** - -当合成对话的数量超过 9000 条时,建议在 ConsistentChatPrompt 中扩展 topic_dict,以提升生成对话的多样性和覆盖范围。为了保证输出数据的质量,算子会自动跳过格式不合规或无法解析的生成结果,确保最终得到的对话结构清晰、内容合理。在生成多轮对话的过程中,该算子会对每条对话调用两次 LLM 接口(一次生成用户提问,一次生成助手回答),因此需要确保所使用的 LLM 服务稳定、响应迅速。 \ No newline at end of file diff --git a/docs/zh/notes/guide/general_operators/text_process_operators.md b/docs/zh/notes/guide/general_operators/text_process_operators.md deleted file mode 100644 index 752a932ae..000000000 --- a/docs/zh/notes/guide/general_operators/text_process_operators.md +++ /dev/null @@ -1,249 +0,0 @@ ---- -title: 通用文本数据处理算子 -createTime: 2025/06/09 11:43:42 -permalink: /zh/guide/q07ou7d9/ ---- - - -# 文本数据处理 -## 概览 -DataFlow目前支持的文本数据处理主要针对于数据点层面,可以分为以下三种类型,分别是数据改写器、数据去重器和数据过滤器。 - - - - - - - - - - - - - - - - - - - - - - - - - -
类型数量描述
数据改写器16通过数据处理、数据增强等方式改善数据点内容(不改变总数量)
数据去重器6通过哈希等方法进行数据点去重
数据过滤器42通过设置阈值等方式过滤数据点
- -## 数据改写器 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称适用类型简介官方仓库或论文
CondorRefinerSFT利用大模型API生成对SFT回复的评价并改写,提升QA对质量paper
LowercaseRefinerNLP将文本字段中的内容转换为小写-
PIIAnonymizeRefiner预训练通过识别和匿名化个人身份信息(PII),如姓名、位置等,来保护隐私Code
RemovePunctuationRefinerNLP移除文本中的标点符号-
RemoveNumberRefinerNLP移除文本中的数字字符-
RemoveExtraSpacesRefinerNLP、预训练移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后空格-
RemoveRepetitionsPunctuationRefinerNLP移除重复的标点符号,例如“!!!”变为“!”-
RemoveEmojiRefiner预训练移除文本中的 emoji 表情符号,例如“😀”Code
RemoveEmoticonsRefiner预训练移除文本中的表情符号,例如“:‑)”,使用预定义的表情符号列表Code
RemoveContractionsRefinerNLP扩展文本中的缩写词(例如将“can't”扩展为“cannot”)Code
HtmlUrlRemoverRefiner预训练移除文本中的URL和HTML标签-
TextNormalizationRefinerNLP规范化文本中的日期格式、货币格式等-
NERRefinerNLP使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体Code
StemmingLemmatizationRefinerNLP对文本进行词干提取或词形还原Code
SpellingCorrectionRefinerNLP、预训练通过SymSpell对文本中的拼写错误进行纠正Code
RemoveStopwordsRefinerNLP移除文本中的停用词(如“the”,“is”)Code
- - -## 数据去重器 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称类别简介官方仓库或论文
HashDeduplicator精确去重使用多种哈希函数(如MD5、SHA256、XXH3_128)对文本进行哈希处理,通过精确的比较哈希值来识别和移除重复数据,适用于小规模简单去重场景。-
CCNetDeduplicator精确去重基于SHA-1哈希算法的前64位进行比较,以识别重复文本。旨在平衡哈希安全性和计算效率。-
NgramHashDeduplicator近似去重结合n-gram技术与哈希算法,将文本分割为多个n-gram片段并分别进行哈希处理。通过多个哈希值的比较来识别相似或重复的文本,适用于处理具有细微差异的重复数据。Paper
SemDeduplicator近似去重基于BERT模型的语义相似度计算,通过生成文本的嵌入向量并计算余弦相似度来识别重复内容。适用于需要语义理解的高级去重场景,能够识别语义上相似但表述不同的文本。Paper
Code
SimHashDeduplicator近似去重采用SimHash算法,通过生成文本的SimHash指纹并计算汉明距离来判断文本的相似度。适用于高效的相似文本检测,能够快速处理大规模数据集中的重复或相似文本。Paper
MinHashDeduplicator近似去重结合MinHash与LSH,通过将集合中的元素哈希成一个较小的签名(通常是一个固定长度的整数或比特串),从而以很小的内存占用和低计算成本比较两个集合之间的相似度。Paper
- -## 数据过滤器 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
名称适用类型简介官方仓库或论文
GeneralFilter任意DataFrame支持通过一个或多个自定义lambda函数对 DataFrame 进行灵活过滤 -
LanguageFilter预训练、SFT使用fasttext语言识别模型过滤特定语言Huggingface
BlocklistFilter预训练、SFT设置阈值,根据List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words屏蔽词表过滤数据点Code
- -同时,作为配套工作,我们支持了Open-DataFlow-Eval文本数据评估模块中基于单个数据点打分器评分的过滤。共18种。 -```yaml -DeitaQualityFilter: - min_score: 1 - max_score: 5 - scorer_args: - device: 'cuda:0' - model_name: 'hkust-nlp/deita-quality-scorer' - max_length: 512 -``` -可通过设置需要保留的`min/max`分数并在`scorer_args`中设置打分器参数实现。 - -支持的打分器,详见[评估算法文档](/zh/guide/text_evaluation_operators/)(除Diversity部分)。 - -此外,启发式规则过滤在预训练数据的筛选方面占有很大的比重,在这一方面,[Dingo数据质量评估工具](https://github.com/DataEval/dingo)对我们的开发带来了很大的启发。我们在`dataflow/operators/filter/GeneralText/heuristics.py`中整合了部分Dingo中使用的规则过滤算法,共22种。详见[规则文档](https://github.com/DataEval/dingo/blob/dev/docs/rules.md),过滤器名称可参考`dataflow/operators/filter/GeneralText/heuristics.py`文件。 - -需要说明的是,以上提到的42种数据过滤器具有相同的`Yaml`调用方式。