From f003ac00724024276d421a1e0383a7861660d693 Mon Sep 17 00:00:00 2001
From: zzy1127 <1726073424@qq.com>
Date: Sun, 1 Feb 2026 18:03:28 +0800
Subject: [PATCH] fix: delete useless docs

---
 docs/.vuepress/notes/en/guide.ts |   12 -
 docs/.vuepress/notes/zh/guide.ts |   12 -
 .../text_evaluation_operators.md | 1556 ----------------
 .../text_generate_operators.md   |  313 ----
 .../text_process_operators.md    |  252 ---
 .../text_evaluation_operators.md | 1567 -----------------
 .../text_generate_operators.md   |  312 ----
 .../text_process_operators.md    |  249 ---
 8 files changed, 4273 deletions(-)
 delete mode 100644 docs/en/notes/guide/general_operators/text_evaluation_operators.md
 delete mode 100644 docs/en/notes/guide/general_operators/text_generate_operators.md
 delete mode 100644 docs/en/notes/guide/general_operators/text_process_operators.md
 delete mode 100644 docs/zh/notes/guide/general_operators/text_evaluation_operators.md
 delete mode 100644 docs/zh/notes/guide/general_operators/text_generate_operators.md
 delete mode 100644 docs/zh/notes/guide/general_operators/text_process_operators.md

diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts
index 37a5b8cbe..6ea317d79 100644
--- a/docs/.vuepress/notes/en/guide.ts
+++ b/docs/.vuepress/notes/en/guide.ts
@@ -44,7 +44,6 @@ export const Guide: ThemeNote = defineNoteConfig({
       'prompted_vqa',
       'mathquestion_extract',
       'knowledge_cleaning',
-      'quick_general_text_evaluation',
       'speech_transcription',
     ],
 
@@ -97,17 +96,6 @@ export const Guide: ThemeNote = defineNoteConfig({
       "easy_evaluation",
     ]
   },
-  {
-    text: "General Operators",
-    collapsed: false,
-    icon: 'material-symbols:analytics-outline',
-    prefix: 'general_operators',
-    items: [
-      "text_evaluation_operators",
-      "text_process_operators",
-      "text_generate_operators",
-    ]
-  },
   {
     text: "Domain-Specific Operators",
     collapsed: false,
diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts
index 31fed3995..f784356f8 100644
--- a/docs/.vuepress/notes/zh/guide.ts
+++ b/docs/.vuepress/notes/zh/guide.ts
@@ -52,7 +52,6 @@ export const Guide: ThemeNote = defineNoteConfig({
       "prompted_vqa",
       "mathquestion_extract",
       'knowledge_cleaning',
-      'quick_general_text_evaluation',
       'speech_transcription',
     ],
   },
@@ -95,17 +94,6 @@ export const Guide: ThemeNote = defineNoteConfig({
       "easy_evaluation",
     ]
   },
-  {
-    text: "通用算子(移动到API)",
-    collapsed: false,
-    icon: 'material-symbols:analytics-outline',
-    prefix: 'general_operators',
-    items: [
-      "text_evaluation_operators",
-      "text_process_operators",
-      "text_generate_operators",
-    ]
-  },
   {
     text: "专用算子(移动到API)",
     collapsed: false,
diff --git a/docs/en/notes/guide/general_operators/text_evaluation_operators.md b/docs/en/notes/guide/general_operators/text_evaluation_operators.md
deleted file mode 100644
index 82561aca6..000000000
--- a/docs/en/notes/guide/general_operators/text_evaluation_operators.md
+++ /dev/null
@@ -1,1556 +0,0 @@
---
title: General Data Evaluation Operators
createTime: 2025/06/09 11:43:25
permalink: /en/guide/text_evaluation_operators/
---

# Text Data Evaluation Metrics

## Text quality evaluation

Scorers are divided into the following four types; each scorer provides one or more scores.

| Type | Count | Description |
|---|---|---|
| APIcaller | 3 | Call API for scoring |
| Diversity | 2 | Compute diversity score of the entire dataset |
| Models | 12 | Model or classifier-based scoring |
| Statistics | 3 | Statistical metric scoring |

| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
|---|---|---|---|---|---|
| AlpagasusScorer✨ | Content Accuracy & Effectiveness | Instruction | Evaluates the quality of instructions by calling GPT, returning a quality score. A higher score indicates higher instruction quality. | [0, 5] | paper |
| PerspectiveScorer✨ | Safety | Text | Uses the Perspective API to evaluate the toxicity of the text, returning a toxicity probability. A higher score indicates higher text toxicity. | [0, 1] | API |
| TreeinstructScorer✨ | Diversity & Complexity | Instruction | Measures instruction complexity by the number of nodes in the generated syntax tree; more nodes indicate a more complex instruction. | - | paper |

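The PerspectiveScorer row above wraps a call to Google's Perspective API. The sketch below shows what such a call can look like with plain `requests`; the endpoint and response fields follow the public Perspective API documentation, while the helper function and key handling are illustrative assumptions, not the operator's actual code.

```python
import requests

API_URL = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"

def toxicity_probability(text: str, api_key: str) -> float:
    """Return the TOXICITY summary score in [0, 1] for `text` via the Perspective API."""
    payload = {
        "comment": {"text": text},
        "languages": ["en"],
        "requestedAttributes": {"TOXICITY": {}},
    }
    resp = requests.post(API_URL, params={"key": api_key}, json=payload, timeout=30)
    resp.raise_for_status()
    return resp.json()["attributeScores"]["TOXICITY"]["summaryScore"]["value"]

# Example (requires a valid Perspective API key):
# print(toxicity_probability("You are a wonderful person.", api_key="YOUR_KEY"))
```
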
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
|---|---|---|---|---|---|
| Task2VecScorer✨ | Diversity & Complexity | Text | Evaluates the diversity of the dataset using the Task2Vec method. Higher scores indicate higher dataset diversity. | [0.0525±3.41E-4, 0.4037±1.932E-5] | paper code |
| VendiScorer | Diversity & Complexity | Text | Evaluates dataset diversity by calculating the Vendi Score; higher scores indicate higher diversity. | - | paper code |

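For orientation, the Vendi Score reported by VendiScorer is the exponential of the Shannon entropy of the eigenvalues of the normalized similarity matrix of the samples. The NumPy sketch below computes it directly from that definition for a small batch of embeddings; it is a minimal illustration of the formula, not the VendiScorer implementation (which builds similarities from n-gram, BERT, or SimCSE embeddings).

```python
import numpy as np

def vendi_score(embeddings: np.ndarray) -> float:
    """Vendi Score = exp(Shannon entropy of the eigenvalues of K/n),
    where K is the cosine-similarity matrix of the n samples."""
    X = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    K = X @ X.T                                 # n x n similarity matrix
    n = K.shape[0]
    eigvals = np.linalg.eigvalsh(K / n)
    eigvals = eigvals[eigvals > 1e-12]          # drop numerical zeros
    entropy = -np.sum(eigvals * np.log(eigvals))
    return float(np.exp(entropy))

# Orthogonal samples score ~n (maximally diverse); identical samples score ~1.
print(vendi_score(np.eye(4)))          # ~4.0
print(vendi_score(np.ones((4, 8))))    # ~1.0
```
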
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
|---|---|---|---|---|---|
| DebertaV3Scorer✨ | Content Accuracy & Effectiveness | Text | A quality classifier based on NVIDIA's DeBERTa V3 model for evaluating text quality. | {Low, Medium, High} | code |
| FineWebEduScorer✨ | Educational Value | Text | A classifier for evaluating the educational value of text; higher scores indicate higher educational value. | [0, 5] | paper code |
| InstagScorer✨ | Diversity & Complexity | Instruction | Evaluates instruction content diversity by returning the number of tags; more tags indicate higher content diversity. | - | paper code |
| PerplexityScorer | Fluency & Understandability | Text | Calculates text perplexity using a KenLM model; lower scores indicate higher fluency and understandability. | - | paper code |
| QuratingScorer✨ | Content Accuracy & Effectiveness, Educational Value | Text | Evaluates text quality using the Qurating model; higher scores indicate higher quality. | - | paper code |
| PairQualScorer🚀 | Educational Value | Text | Evaluates text quality using the PairQual model, which is based on the BGE model, supports both Chinese and English, and is trained on GPT pairwise comparison scores. A higher score indicates higher quality. | - | code |
| PresidioScorer✨ | Safety | Text | Uses the Microsoft Presidio model to identify personally identifiable information (PII) in text, such as credit card numbers, names, and locations. The scorer returns the number of PII entities found. | - | code |
| SuperfilteringScorer✨ | Fluency & Understandability | Instruction | Evaluates instruction-following difficulty using the Superfiltering method; higher scores indicate instructions that are harder to follow. | - | paper code |
| TextbookScorer✨ | Educational Value | Text | A textbook quality classifier based on FastText, used to evaluate the educational value of text. | [0, 2] | paper code |
| DeitaQualityScorer✨ | Content Accuracy & Effectiveness | Instruction | An instruction quality scorer based on the Llama model; higher scores indicate higher instruction quality. | [1, 6] | paper code |
| DeitaComplexityScorer✨ | Diversity & Complexity | Instruction | An instruction complexity scorer based on the Llama model; higher scores indicate higher instruction complexity. | [1, 6] | paper code |
| RMScorer✨ | Fluency & Understandability | Instruction | A quality scorer based on the human-preference reward model reward-model-deberta-v3-large-v2. Higher scores indicate higher quality. | - | code |

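The PresidioScorer row above counts PII entities with Microsoft Presidio. A minimal sketch with the `presidio-analyzer` package follows; the English-only setup and the plain entity count are assumptions made for illustration rather than the operator's exact configuration.

```python
# pip install presidio-analyzer (plus a spaCy model such as en_core_web_lg)
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

def pii_count(text: str) -> int:
    """Return the number of PII entities Presidio detects in `text`."""
    results = analyzer.analyze(text=text, language="en")
    return len(results)

sample = "Contact John Smith at john.smith@example.com or +1-202-555-0147."
print(pii_count(sample))  # e.g. 3 (person, email address, phone number)
```
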
| Name | Evaluation Dimension | Data Type | Description | Value Range | Official Repository or Paper |
|---|---|---|---|---|---|
| LangkitScorer | Text Structure, Fluency & Understandability | Text | Calculates statistical information about the text using the Langkit toolkit, such as word count, sentence count, and syllable count, to help evaluate structural complexity and readability. | - | code |
| LexicalDiversityScorer✨ | Diversity & Complexity | Text | Calculates lexical diversity scores using the MTLD and HD-D methods; higher scores represent richer vocabulary use, reflecting the diversity and complexity of the text. | - | paper code |
| NgramScorer | Diversity & Complexity | Text | Calculates the repetition ratio of n-grams in the text to measure text repetition; higher scores indicate lower n-gram repetition. | [0, 1] | - |

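The NgramScorer described above boils down to a unique-to-total n-gram ratio. The self-contained sketch below shows that statistic; the whitespace tokenization and default n are illustrative choices, not the operator's exact settings.

```python
def ngram_uniqueness(text: str, n: int = 3) -> float:
    """Ratio of unique n-grams to total n-grams; 1.0 means no repetition."""
    tokens = text.lower().split()
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    if not ngrams:
        return 1.0  # too short to form any n-gram; treat as non-repetitive
    return len(set(ngrams)) / len(ngrams)

print(ngram_uniqueness("the cat sat on the mat"))               # 1.0
print(ngram_uniqueness("again and again and again and again"))  # 0.4
```
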
| Scorer Name | Score Metric Name | Description | Mean | Variance | Max | Min |
|---|---|---|---|---|---|---|
| PerspectiveScorer | PerspectiveScore | Evaluates the toxicity of the text, checking for potential insults or inappropriate language. The higher the score, the higher the toxicity. | 0.0426 | 0.0025 | 0.2610 | 0.0026 |
| LexicalDiversityScorer | LexicalDiversityMTLDScore | Measures the lexical diversity of the text. The higher the score, the more varied the vocabulary usage. | 100.5990 | 1625.1318 | 1165.7164 | 14.8439 |
| | LexicalDiversityHD-DScore | Measures the lexical diversity of the text based on a discrete distribution (HD-D). The higher the score, the higher the lexical diversity. | 0.8487 | 0.0014 | 0.9873 | 0.5570 |
| NgramScorer | NgramScore | Calculates the repetition ratio of n-grams in the text to measure the degree of repetition. The higher the score, the lower the n-gram repetition. | 0.9938 | 0.0002 | 1.0 | 0.8285 |
| LangkitScorer | LangkitFleschReadingEaseScore | Measures Flesch reading ease. The higher the score, the easier the text is to read. | 55.1870 | 324.8975 | 106.37 | -144.75 |
| | LangkitAutomatedReadabilityIndexScore | Automated readability index based on sentence length and vocabulary difficulty. The higher the score, the harder the text is to read. | 11.7727 | 19.4117 | 98.2 | 0.9 |
| | LangkitAggregateReadingLevelScore | Aggregate reading difficulty score of the text. The higher the score, the harder the text is to read. | 11.2332 | 13.6816 | 77.0 | 0.0 |
| | LangkitSyllableCountScore | Counts the total number of syllables in the text. The higher the score, the more syllables there are. | 815.3852 | 2299853.7272 | 43237 | 32 |
| | LangkitLexiconCountScore | Counts the total number of words in the text. The higher the score, the more words there are. | 524.178 | 1061058.5875 | 33033 | 23 |
| | LangkitSentenceCountScore | Counts the total number of sentences in the text. The higher the score, the more sentences there are. | 28.9664 | 3618.2549 | 2193 | 1 |
| | LangkitCharacterCountScore | Counts the total number of characters in the text. The higher the score, the more characters there are. | 2610.2462 | 23580442.8820 | 139807 | 118 |
| | LangkitLetterCountScore | Counts the total number of letters in the text. The higher the score, the more letters there are. | 2513.4572 | 21890120.2030 | 134507 | 109 |
| | LangkitPolysyllableCountScore | Counts the number of polysyllabic words in the text. The higher the score, the more polysyllabic words there are. | 78.8834 | 18918.1990 | 3261 | 0 |
| | LangkitMonosyllableCountScore | Counts the number of monosyllabic words, which usually correlates with the text's simplicity. The higher the score, the more monosyllabic words there are. | 334.6674 | 503285.5160 | 25133 | 13 |
| | LangkitDifficultWordsScore | Counts the number of difficult words in the text. The higher the score, the more difficult words there are. | 93.4112 | 14401.2789 | 2366 | 4 |
| TextbookScorer | TextbookScore | Tests whether the text meets textbook standards. The higher the score, the closer the text is to an ideal textbook. | 0.9255 | 0.1779 | 1.9867 | 0.0001 |
| FineWebEduScorer | FineWebEduScore | Measures the educational value of the text. The higher the score, the greater the educational value. | 1.1901 | 0.4924 | 4.6827 | -0.6319 |
| DebertaV3Scorer | DebertaV3Score | Text quality evaluation using the DebertaV3 model; quality is classified as high, medium, or low. | Medium: 3180 times | - | High: 1412 times | Low: 408 times |
| PerplexityScorer | PerplexityScore | Measures the perplexity of the text. The higher the score, the greater the model's perplexity. | 564.3942 | 165893.5542 | 8271.0 | 13.9 |
| QuratingScorer | QuratingWritingStyleScore | Evaluates the quality of the text's writing style. The higher the score, the better the writing style. | 0.6453 | 6.7949 | 8.375 | -7.3474 |
| | QuratingRequiredExpertiseScore | Measures the level of expertise the text requires. The higher the score, the more expertise is required. | -0.4661 | 7.0458 | 9.0 | -8.25 |
| | QuratingFactsAndTriviaScore | Tests whether the text contains facts and trivia. The higher the score, the more facts and trivia it contains. | 0.1889 | 4.5678 | 7.4688 | -6.0993 |
| | QuratingEducationalValueScore | Measures the educational value of the text. The higher the score, the greater the educational value. | 1.2946 | 11.2196 | 11.5625 | -8.7843 |
| InstagScorer | InstagScore | Evaluates content diversity by returning the number of tags. The higher the score, the greater the content diversity. | 2.304 | 2.9396 | 11 | 1 |
| SuperfilteringScorer | SuperfilteringScore | Evaluates instruction-following difficulty using the Superfiltering method. The higher the score, the harder the instructions are to follow. | 1.3223 | 836.0302 | 1978.6534 | 0.0011 |
| DeitaQualityScorer | DeitaQualityScore | Instruction quality evaluation based on the Llama model. The higher the score, the better the instruction quality. | 3.5629 | 0.9247 | 5.5309 | 1.0840 |
| DeitaComplexityScorer | DeitaComplexityScore | Instruction complexity evaluation based on the Llama model. The higher the score, the greater the instruction complexity. | 1.4936 | 0.2086 | 3.3207 | 1.0001 |
| VendiScorer | N-grams_VendiScore | Evaluates text diversity based on n-gram embeddings. The higher the score, the greater the dataset diversity. | 1832.96 | - | - | - |
| | BERT_VendiScore | Evaluates text diversity based on BERT embeddings. The higher the score, the greater the dataset diversity. | 1.83 | - | - | - |
| | SimCSE_VendiScore | Evaluates text diversity based on SimCSE embeddings. The higher the score, the greater the dataset diversity. | 68.94 | - | - | - |
| Task2VecScorer | Task2VecScore | Evaluates dataset diversity using the Task2Vec diversity coefficient. The higher the score, the greater the dataset diversity. | 0.0673 | - | - | - |
| AlpagasusScorer | AlpagasusScore | Evaluates instruction quality using ChatGPT. The higher the score, the better the instruction quality. | 4.172 | 0.2164 | 5.0 | 2.0 |
| TreeinstructScorer | TreeinstructScore | Uses ChatGPT to evaluate the semantic complexity of instructions. The higher the score, the greater the semantic complexity. | 6.494 | 9.7540 | 63.0 | 0.0 |
| PresidioScorer | PresidioScore | Uses Presidio to count PII (personally identifiable information) instances. The higher the score, the more PII the text contains. | 21.4008 | 2915.3542 | 1786.0 | 0.0 |
| RMScorer | RMScore | Uses a reward model based on human preferences to evaluate the quality of SFT (supervised fine-tuning) data. The higher the score, the better the data quality. | 3.1537 | 9.9461 | 8.6803 | -4.9680 |

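The LangkitFleschReadingEaseScore range above (−144.75 to 106.37) follows from the Flesch formula itself: it is an affine function of average sentence length and average syllables per word, so it is unbounded in both directions. For reference, this is the standard formula written out as a small function; it is not code taken from Langkit.

```python
def flesch_reading_ease(total_words: int, total_sentences: int, total_syllables: int) -> float:
    """Flesch Reading Ease: higher is easier to read; the scale is unbounded in both directions."""
    return (206.835
            - 1.015 * (total_words / total_sentences)
            - 84.6 * (total_syllables / total_words))

# Short, simple sentences score high; one long, polysyllable-heavy sentence can go negative.
print(flesch_reading_ease(total_words=10, total_sentences=1, total_syllables=12))  # ~95.2
print(flesch_reading_ease(total_words=40, total_sentences=1, total_syllables=90))  # ~-24.1
```
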
| Scorer Name | Description | Value Range | Interpretation |
|---|---|---|---|
| BLEU Scorer | Calculates precision based on n-gram matching, comparing n-grams in the generated text with those in the reference text. | [0, 1] | Higher values indicate a closer match between the generated and reference texts. |
| CIDEr Scorer | Uses TF-IDF-weighted n-gram statistics to compare the similarity between generated and reference descriptions. | [0, 1] | Higher values indicate stronger content consistency between the generated and reference texts. |
| BertScore | Computes the similarity of word embeddings between the generated and reference texts using BERT. | [0, 1] | Higher values indicate stronger semantic similarity between the generated and reference texts. |

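The BLEU entry above is the usual modified n-gram precision combined with a brevity penalty. The deliberately simplified, single-reference, unsmoothed sketch below makes that computation concrete; real evaluations would rely on an established implementation such as sacrebleu or NLTK.

```python
import math
from collections import Counter

def bleu(candidate: str, reference: str, max_n: int = 4) -> float:
    """Simplified single-reference BLEU: geometric mean of modified
    n-gram precisions (n = 1..max_n) times a brevity penalty."""
    cand, ref = candidate.split(), reference.split()
    log_precisions = []
    for n in range(1, max_n + 1):
        cand_ngrams = Counter(tuple(cand[i:i + n]) for i in range(len(cand) - n + 1))
        ref_ngrams = Counter(tuple(ref[i:i + n]) for i in range(len(ref) - n + 1))
        overlap = sum(min(c, ref_ngrams[g]) for g, c in cand_ngrams.items())
        total = max(sum(cand_ngrams.values()), 1)
        if overlap == 0:
            return 0.0  # unsmoothed: an empty n-gram overlap zeroes the score
        log_precisions.append(math.log(overlap / total))
    brevity = 1.0 if len(cand) > len(ref) else math.exp(1 - len(ref) / max(len(cand), 1))
    return brevity * math.exp(sum(log_precisions) / max_n)

print(bleu("the cat sat on the mat", "the cat sat on the mat"))  # 1.0
```
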
| Name | Applicable Type | Description | Repository or Paper |
|---|---|---|---|
| PretrainGenerator | Pretrain | Synthesizes phi-4-style question-answer pairs from pre-training document data, restating the document in QA format. | Paper |
| SFTGeneratorSeed | SFT | Synthesizes SFT-format QA pairs from seed documents and returns the original document content. | - |
| CondorGenerator | SFT | Two-stage synthesis of SFT-format data from scratch based on preset knowledge-tree labels (increasing label variety is recommended when generating more than 5,000 samples). | paper |
| PromptedGenerator | - | Generates data based on a user-defined prompt. | - |
| ConsistentChatGenerator | Multi-turn Dialogue | Two-stage synthesis of multi-turn dialogue data from scratch based on preset topics and human intents (increasing label variety is recommended when generating more than 9,000 samples). | paper |

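PromptedGenerator in the table above generates data from a user-defined prompt. The sketch below shows that pattern against an OpenAI-compatible chat API; the model name, prompts, and client setup are placeholders for illustration, not DataFlow's actual operator interface.

```python
from openai import OpenAI  # any OpenAI-compatible endpoint can be used via base_url

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def generate_from_prompt(system_prompt: str, seed_text: str, model: str = "gpt-4o-mini") -> str:
    """Apply a user-defined system prompt to a seed text and return the generation."""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": seed_text},
        ],
    )
    return response.choices[0].message.content

print(generate_from_prompt(
    "Rewrite the following passage as a question-answer pair.",
    "Water boils at 100 degrees Celsius at sea level.",
))
```
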
| Type | Count | Description |
|---|---|---|
| Refiners | 16 | Improve the content of data points through processing and augmentation without altering the total count. |
| Deduplicators | 6 | Remove duplicate data points using methods such as hashing. |
| Filters | 42 | Filter data points based on thresholds and other criteria. |

| Name | Applicable Type | Description | Repository or Paper |
|---|---|---|---|
| CondorRefiner | SFT | Generates evaluations and rewrites of SFT responses using LLM APIs to improve QA quality. | paper |
| LowercaseRefiner | NLP | Converts text fields to lowercase. | - |
| PIIAnonymizeRefiner | Pre-training | Anonymizes personally identifiable information (PII), such as names and locations, to protect privacy. | Code |
| RemovePunctuationRefiner | NLP | Removes punctuation from text. | - |
| RemoveNumberRefiner | NLP | Removes numeric characters from text. | - |
| RemoveExtraSpacesRefiner | NLP, Pre-training | Replaces multiple consecutive spaces with a single space and trims leading/trailing spaces. | - |
| RemoveRepetitionsPunctuationRefiner | NLP | Removes repeated punctuation, e.g., "!!!" becomes "!". | - |
| RemoveEmojiRefiner | Pre-training | Removes emojis from text, e.g., "😀". | Code |
| RemoveEmoticonsRefiner | Pre-training | Removes emoticons such as ":-)", using a predefined list. | Code |
| RemoveContractionsRefiner | NLP | Expands contractions in text, e.g., "can't" becomes "cannot". | Code |
| HtmlUrlRemoverRefiner | Pre-training | Removes URLs and HTML tags from text. | - |
| TextNormalizationRefiner | NLP | Normalizes formats for dates, currencies, etc., in text. | - |
| NERRefiner | NLP | Uses Named Entity Recognition (NER) to identify and mask specific entities in text. | Code |
| StemmingLemmatizationRefiner | NLP | Performs stemming or lemmatization on text. | Code |
| SpellingCorrectionRefiner | NLP, Pre-training | Corrects spelling errors in text using SymSpell. | Code |
| RemoveStopwordsRefiner | NLP | Removes stopwords (e.g., "the", "is") from text. | Code |

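Several of the refiners above are thin wrappers around string normalization. The functions below illustrate the kind of logic described for RemoveExtraSpacesRefiner and RemoveRepetitionsPunctuationRefiner; they are standalone examples, not the operators' source code.

```python
import re

def remove_extra_spaces(text: str) -> str:
    """Collapse runs of whitespace to a single space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()

def remove_repeated_punctuation(text: str) -> str:
    """Reduce runs of the same punctuation mark to one occurrence, e.g. '!!!' -> '!'."""
    return re.sub(r"([!?.,;:])\1+", r"\1", text)

print(remove_extra_spaces("  too   many    spaces  "))    # "too many spaces"
print(remove_repeated_punctuation("Really?!!! Wow...."))   # "Really?! Wow."
```
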
| Name | Type | Description | Repository or Paper |
|---|---|---|---|
| HashDeduplicator | Exact Deduplication | Uses various hash functions (e.g., MD5, SHA256, XXH3_128) to remove duplicate data based on exact hash value comparison. Suitable for small-scale, simple deduplication. | - |
| CCNetDeduplicator | Exact Deduplication | Compares the first 64 bits of the SHA-1 hash to identify duplicate text, balancing security and computational efficiency. | - |
| NgramHashDeduplicator | Near Deduplication | Combines n-gram techniques with hashing to detect duplicates based on multiple hash comparisons of n-gram segments. Useful for identifying near-duplicates. | Paper |
| SemDeduplicator | Near Deduplication | Uses semantic similarity based on BERT embeddings and cosine similarity to detect duplicates. Ideal for detecting semantically similar but differently phrased text. | Paper Code |
| SimHashDeduplicator | Near Deduplication | Uses the SimHash algorithm to detect similar text based on the Hamming distance of fingerprints. Efficient for large-scale data deduplication. | Paper |
| MinHashDeduplicator | Near Deduplication | Combines MinHash and LSH to compare sets with minimal memory usage and computation cost, detecting similarity between sets. | Paper |

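The MinHashDeduplicator row combines MinHash signatures with locality-sensitive hashing (LSH). One common way to prototype this in Python is the `datasketch` library, as sketched below; the word-level shingling, threshold, and permutation count are illustrative choices, not the operator's defaults.

```python
from datasketch import MinHash, MinHashLSH

def minhash_of(text: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash signature from the set of lowercase word tokens."""
    m = MinHash(num_perm=num_perm)
    for token in set(text.lower().split()):
        m.update(token.encode("utf-8"))
    return m

docs = {
    "a": "the quick brown fox jumps over the lazy dog",
    "b": "the quick brown fox jumped over the lazy dog",
    "c": "an entirely different sentence about deduplication",
}
lsh = MinHashLSH(threshold=0.5, num_perm=128)
signatures = {key: minhash_of(text) for key, text in docs.items()}
for key, sig in signatures.items():
    lsh.insert(key, sig)

# Near-duplicates land in the same buckets, e.g. ['a', 'b'] (order not guaranteed).
print(lsh.query(signatures["a"]))
```
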
| Name | Applicable Type | Description | Repository or Paper |
|---|---|---|---|
| GeneralFilter | Any DataFrame | Supports flexible filtering of the DataFrame using one or more custom lambda functions. | - |
| LanguageFilter | Pre-training, SFT | Filters specific languages using the fasttext language identification model. | Huggingface |
| BlocklistFilter | Pre-training, SFT | Filters data points using a blocklist (e.g., List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words). | Code |

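GeneralFilter applies one or more user-supplied lambda predicates to a DataFrame. The pandas snippet below illustrates the same idea; the `raw_content` column name and the specific predicates are assumptions made for the example, not a required schema.

```python
import pandas as pd

df = pd.DataFrame({
    "raw_content": [
        "A short line.",
        "A much longer paragraph that easily clears the minimum length threshold for keeping.",
        "   ",
    ]
})

# Chain one or more row-level predicates, mirroring a lambda-based general filter.
predicates = [
    lambda s: isinstance(s, str) and s.strip() != "",  # drop empty/whitespace-only rows
    lambda s: len(s) >= 20,                            # drop very short rows
]

mask = pd.Series(True, index=df.index)
for pred in predicates:
    mask &= df["raw_content"].map(pred)

filtered = df[mask]
print(len(df), "->", len(filtered))  # 3 -> 1
```
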
| 类型 | 数量 | 描述 |
|---|---|---|
| APIcaller | 3 | 调用API打分 |
| Diversity | 2 | 计算整个数据集的多样性得分 |
| Models | 12 | 基于模型、分类器打分 |
| Statistics | 3 | 统计学指标打分 |

| 名称 | 评估维度 | 数据类型 | 简介 | 取值范围 | 官方仓库或论文 |
|---|---|---|---|---|---|
| AlpagasusScorer✨ | 内容准确性与有效性 | 指令 | 通过调用 GPT 评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。 | [0, 5] | paper |
| PerspectiveScorer✨ | 安全性 | 文本 | 使用 PerspectiveAPI 评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。 | [0, 1] | API |
| TreeinstructScorer✨ | 多样性与复杂性 | 指令 | 通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。 | - | paper |

| 名称 | 评估维度 | 数据类型 | 简介 | 取值范围 | 官方仓库或论文 |
|---|---|---|---|---|---|
| Task2VecScorer✨ | 多样性与复杂性 | 文本 | 评估数据集的多样性,使用 Task2Vec 方法,高分表示数据集具有较高的多样性。 | [0.0525±3.41E-4, 0.4037±1.932E-5] | paper code |
| VendiScorer | 多样性与复杂性 | 文本 | 通过计算 VendiScore 来评估数据集的多样性,得分越高表示多样性越高。 | - | paper code |

| 名称 | 评估维度 | 数据类型 | 简介 | 取值范围 | 官方仓库或论文 |
|---|---|---|---|---|---|
| DebertaV3Scorer✨ | 内容准确性与有效性 | 文本 | 基于 Nvidia Deberta V3 模型的质量分类器,用于评估文本质量。 | {Low, Medium, High} | code |
| FineWebEduScorer✨ | 教育价值 | 文本 | 用于评估文本教育价值的分类器,高分表示文本具有较高的教育价值。 | [0, 5] | paper code |
| InstagScorer✨ | 多样性与复杂性 | 指令 | 通过返回标签的数量来评估指令的内容多样性,标签越多表示内容多样性越大。 | - | paper code |
| PerplexityScorer | 流畅性与可理解性 | 文本 | 基于 Kenlm 模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。 | - | paper code |
| QuratingScorer✨ | 内容准确性与有效性、教育价值 | 文本 | 通过 Qurating 模型评估文本的质量,得分越高表示质量越高。 | - | paper code |
| PairQualScorer🚀 | 教育价值 | 文本 | 通过 PairQual 模型评估文本的质量,基于bge模型,支持中英双语,使用gpt对文本成对比较打分后训练而成。得分越高表示质量越高。 | - | code |
| PresidioScorer✨ | 安全性 | 文本 | 使用Microsoft Presidio模型,识别文本中的私人实体(PII)如信用卡号、姓名、位置等。打分器返回PII信息个数。 | - | code |
| SuperfilteringScorer✨ | 流畅性与可理解性 | 指令 | 使用 Superfiltering 方法评估指令的跟随难度,得分越高表示指令越难跟随。 | - | paper code |
| TextbookScorer✨ | 教育价值 | 文本 | 基于 FastText 分类器的课本质量分类器,用于评估文本的教育价值。 | [0, 2] | paper code |
| DeitaQualityScorer✨ | 内容准确性与有效性 | 指令 | 基于 Llama 模型的 Deita 指令质量评估器,高分表示指令质量较高。 | [1,6] | paper code |
| DeitaComplexityScorer✨ | 多样性与复杂性 | 指令 | 基于 Llama 模型的 Deita 指令复杂性评估器,高分表示指令复杂性较高。 | [1,6] | paper code |
| RMScorer✨ | 流畅性与可理解性 | 指令 | 基于人类价值判断的奖励模型reward-model-deberta-v3-large-v2质量评分器。高分代表质量较高。 | - | code |

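The PerplexityScorer row above relies on a KenLM n-gram language model. A minimal sketch with the `kenlm` Python bindings follows; the model path is a placeholder, and the scorer's actual preprocessing and normalization may differ.

```python
# pip install kenlm
import kenlm

model = kenlm.Model("path/to/your_model.arpa")  # placeholder path to a trained n-gram LM

def perplexity(text: str) -> float:
    """Per-word perplexity of `text` under the KenLM model (lower = more fluent)."""
    return model.perplexity(text)

print(perplexity("this is a simple , fluent sentence ."))
print(perplexity("colorless green ideas sleep furiously furiously"))
```
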
| 名称 | 评估维度 | 数据类型 | 简介 | 取值范围 | 官方仓库或论文 |
|---|---|---|---|---|---|
| LangkitScorer | 文本结构, 流畅性与可理解性 | 文本 | 使用Langkit工具包计算文本的统计信息,如字数、句子数、音节数等,帮助评估文本的结构复杂性和可读性。 | - | code |
| LexicalDiversityScorer✨ | 多样性与复杂性 | 文本 | 使用MTLD和HDD方法计算词汇多样性评分,高分代表更丰富的词汇使用,反映文本的多样性和复杂性。 | - | paper code |
| NgramScorer | 多样性与复杂性 | 文本 | 计算文本中n-gram的重复比例,用以衡量文本的重复度,得分越高表示文本中重复的n-gram比例越低。 | [0, 1] | - |

| 打分器名称 | 分数指标名称 | 简介 | 均值 | 方差 | 最大值 | 最小值 |
|---|---|---|---|---|---|---|
| PerspectiveScorer | PerspectiveScore | 评估文本的毒性,是否含有潜在的侮辱性或不当言论。分数越高毒性越大。 | 0.0426 | 0.0025 | 0.2610 | 0.0026 |
| LexicalDiversityScorer | LexicalDiversityMTLDScore | 测量文本的词汇多样性。分数越高词汇多样性越大。 | 100.5990 | 1625.1318 | 1165.7164 | 14.8439 |
| | LexicalDiversityHD-DScore | 用于衡量文本的词汇多样性,基于离散分布计算。分数越高词汇多样性越大。 | 0.8487 | 0.0014 | 0.9873 | 0.5570 |
| NgramScorer | NgramScore | 计算文本中n-gram的重复比例,用以衡量文本的重复度。分数越高N-gram重复性越低。 | 0.9938 | 0.0002 | 1.0 | 0.8285 |
| LangkitScorer | LangkitFleschReadingEaseScore | 衡量文本的Flesch可读性。得分越高表示越易读。 | 55.1870 | 324.8975 | 106.37 | -144.75 |
| | LangkitAutomatedReadabilityIndexScore | 自动可读性指标,基于句子长度和词汇难度。得分越高表示越难读。 | 11.7727 | 19.4117 | 98.2 | 0.9 |
| | LangkitAggregateReadingLevelScore | 综合文本的阅读难度评分。得分越高表示越难读。 | 11.2332 | 13.6816 | 77.0 | 0.0 |
| | LangkitSyllableCountScore | 统计文本中音节的总数。得分越高音节数量越大。 | 815.3852 | 2299853.7272 | 43237 | 32 |
| | LangkitLexiconCountScore | 统计文本中词汇的总数。得分越高词汇数量越大。 | 524.178 | 1061058.5875 | 33033 | 23 |
| | LangkitSentenceCountScore | 统计文本中的句子数量。得分越高句子数量越大。 | 28.9664 | 3618.2549 | 2193 | 1 |
| | LangkitCharacterCountScore | 统计文本中的字符数量。得分越高字符数量越大。 | 2610.2462 | 23580442.8820 | 139807 | 118 |
| | LangkitLetterCountScore | 统计文本中的字母数量。得分越高字母数量越大。 | 2513.4572 | 21890120.2030 | 134507 | 109 |
| | LangkitPolysyllableCountScore | 统计多音节单词的数量。得分越高多音节词数量越大。 | 78.8834 | 18918.1990 | 3261 | 0 |
| | LangkitMonosyllableCountScore | 统计单音节单词的数量,通常与文本的简易度相关。得分越高单音节词数量越大。 | 334.6674 | 503285.5160 | 25133 | 13 |
| | LangkitDifficultWordsScore | 统计文本中难词的数量。得分越高难词数量越大。 | 93.4112 | 14401.2789 | 2366 | 4 |
| TextbookScorer | TextbookScore | 测试文本是否符合教科书标准。得分越高文本越接近理想教材。 | 0.9255 | 0.1779 | 1.9867 | 0.0001 |
| FineWebEduScorer | FineWebEduScore | 测量文本的教育价值。得分越高文本教育价值越大。 | 1.1901 | 0.4924 | 4.6827 | -0.6319 |
| DebertaV3Scorer | DebertaV3Score | 使用DebertaV3模型进行的文本评估。评估质量得分按高、中、低分类。 | Medium: 3180 次 | - | High: 1412 次 | Low: 408 次 |
| PerplexityScorer | PerplexityScore | 衡量文本的困惑度。得分越高模型困惑度越大。 | 564.3942 | 165893.5542 | 8271.0 | 13.9 |
| QuratingScorer | QuratingWritingStyleScore | 评估文本的写作风格是否良好。得分越高文本写作风格越好。 | 0.6453 | 6.7949 | 8.375 | -7.3474 |
| | QuratingRequiredExpertiseScore | 衡量文本需要的专业知识水平。得分越高文本越需要专业知识。 | -0.4661 | 7.0458 | 9.0 | -8.25 |
| | QuratingFactsAndTriviaScore | 测试文本是否包含事实和趣闻。得分越高文本包含的事实和趣闻越多。 | 0.1889 | 4.5678 | 7.4688 | -6.0993 |
| | QuratingEducationalValueScore | 衡量文本的教育价值。得分越高文本教育价值越大。 | 1.2946 | 11.2196 | 11.5625 | -8.7843 |
| InstagScorer | InstagScore | 通过返回标签的数量来评估指令的内容多样性。得分越高内容多样性越大。 | 2.304 | 2.9396 | 11 | 1 |
| SuperfilteringScorer | SuperfilteringScore | 使用 Superfiltering 方法评估指令的跟随难度。得分越高指令跟随难度越大。 | 1.3223 | 836.0302 | 1978.6534 | 0.0011 |
| DeitaQualityScorer | DeitaQualityScore | 基于 Llama 模型的 Deita 指令质量评估器。得分越高指令质量越好。 | 3.5629 | 0.9247 | 5.5309 | 1.0840 |
| DeitaComplexityScorer | DeitaComplexityScore | 基于 Llama 模型的 Deita 指令复杂性评估器。得分越高指令复杂性越大。 | 1.4936 | 0.2086 | 3.3207 | 1.0001 |
| VendiScorer | N-grams_VendiScore | 基于N-grams嵌入评估文本多样性得分。得分越高数据集多样性越大。 | 1832.96 | - | - | - |
| | BERT_VendiScore | 基于BERT嵌入评估文本多样性得分。得分越高数据集多样性越大。 | 1.83 | - | - | - |
| | SimCSE_VendiScore | 基于SimCSE嵌入计算文本多样性得分。得分越高数据集多样性越大。 | 68.94 | - | - | - |
| Task2VecScorer | Task2VecScore | 使用Task2Vec多样性系数评估数据集多样性。得分越高数据集多样性越大。 | 0.0673 | - | - | - |
| AlpagasusScorer | AlpagasusScore | 调用ChatGPT评估指令质量得分。得分越高指令质量越好。 | 4.172 | 0.2164 | 5.0 | 2.0 |
| TreeinstructScorer | TreeinstructScore | 调用ChatGPT评估指令语义复杂度。得分越高指令语义复杂度越高。 | 6.494 | 9.7540 | 63.0 | 0.0 |
| PresidioScorer | PresidioScore | 使用Presidio评估PII个数。得分越高文本含有的PII信息越多。 | 21.4008 | 2915.3542 | 1786.0 | 0.0 |
| RMScorer | RMScore | 使用基于人类价值的奖励模型评估SFT数据质量。得分越高数据质量越高。 | 3.1537 | 9.9461 | 8.6803 | -4.9680 |

| 打分器名称 | 简介 | 取值范围 | 值解释 |
|---|---|---|---|
| BLEU Scorer | 基于 n-gram 匹配的精确度计算,将生成文本中的 n-gram 与参考文本中的 n-gram 进行匹配并计算精确度 | [0, 1] | 值越大,表示生成文本与参考文本的匹配程度越高 |
| CIDEr Scorer | 利用 TF-IDF 加权的 n-gram 统计,将生成文本的描述与参考描述进行相似性比较 | [0, 1] | 值越大,表示生成文本与参考文本在内容上越一致 |
| BertScorer | 使用 Bert 模型计算生成文本与参考文本的词向量相似性,输出精确度、召回率和 F1 分数 | [0, 1] | 值越大,表示生成文本与参考文本在语义上越相似 |

| 名称 | 适用类型 | 简介 | 官方仓库或论文 |
|---|---|---|---|
| PretrainGenerator | 预训练 | 使用预训练文档数据合成类phi-4问答数据对,使用QA格式复述文档 | Paper |
| SFTGeneratorSeed | SFT | 根据种子文档合成SFT格式QA数据对,并返回原文信息 | - |
| CondorGenerator | SFT | 根据预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量) | paper |
| PromptedGenerator | - | 根据用户自定义prompt进行数据生成 | - |
| ConsistentChatGenerator | 多轮对话 | 根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量) | paper |

| 类型 | 数量 | 描述 |
|---|---|---|
| 数据改写器 | 16 | 通过数据处理、数据增强等方式改善数据点内容(不改变总数量) |
| 数据去重器 | 6 | 通过哈希等方法进行数据点去重 |
| 数据过滤器 | 42 | 通过设置阈值等方式过滤数据点 |

| 名称 | 适用类型 | 简介 | 官方仓库或论文 |
|---|---|---|---|
| CondorRefiner | SFT | 利用大模型API生成对SFT回复的评价并改写,提升QA对质量 | paper |
| LowercaseRefiner | NLP | 将文本字段中的内容转换为小写 | - |
| PIIAnonymizeRefiner | 预训练 | 通过识别和匿名化个人身份信息(PII),如姓名、位置等,来保护隐私 | Code |
| RemovePunctuationRefiner | NLP | 移除文本中的标点符号 | - |
| RemoveNumberRefiner | NLP | 移除文本中的数字字符 | - |
| RemoveExtraSpacesRefiner | NLP、预训练 | 移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后空格 | - |
| RemoveRepetitionsPunctuationRefiner | NLP | 移除重复的标点符号,例如“!!!”变为“!” | - |
| RemoveEmojiRefiner | 预训练 | 移除文本中的表情符号,例如"😀" | Code |
| RemoveEmoticonsRefiner | 预训练 | 移除文本中的表情符号,例如“:‑)”,使用预定义的表情符号列表 | Code |
| RemoveContractionsRefiner | NLP | 扩展文本中的缩写词(例如将“can't”扩展为“cannot”) | Code |
| HtmlUrlRemoverRefiner | 预训练 | 移除文本中的URL和HTML标签 | - |
| TextNormalizationRefiner | NLP | 规范化文本中的日期格式、货币格式等 | - |
| NERRefiner | NLP | 使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体 | Code |
| StemmingLemmatizationRefiner | NLP | 对文本进行词干提取或词形还原 | Code |
| SpellingCorrectionRefiner | NLP、预训练 | 通过SymSpell对文本中的拼写错误进行纠正 | Code |
| RemoveStopwordsRefiner | NLP | 移除文本中的停用词(如“the”,“is”) | Code |

| 名称 | 类别 | 简介 | 官方仓库或论文 |
|---|---|---|---|
| HashDeduplicator | 精确去重 | 使用多种哈希函数(如MD5、SHA256、XXH3_128)对文本进行哈希处理,通过精确的比较哈希值来识别和移除重复数据,适用于小规模简单去重场景。 | - |
| CCNetDeduplicator | 精确去重 | 基于SHA-1哈希算法的前64位进行比较,以识别重复文本。旨在平衡哈希安全性和计算效率。 | - |
| NgramHashDeduplicator | 近似去重 | 结合n-gram技术与哈希算法,将文本分割为多个n-gram片段并分别进行哈希处理。通过多个哈希值的比较来识别相似或重复的文本,适用于处理具有细微差异的重复数据。 | Paper |
| SemDeduplicator | 近似去重 | 基于BERT模型的语义相似度计算,通过生成文本的嵌入向量并计算余弦相似度来识别重复内容。适用于需要语义理解的高级去重场景,能够识别语义上相似但表述不同的文本。 | Paper Code |
| SimHashDeduplicator | 近似去重 | 采用SimHash算法,通过生成文本的SimHash指纹并计算汉明距离来判断文本的相似度。适用于高效的相似文本检测,能够快速处理大规模数据集中的重复或相似文本。 | Paper |
| MinHashDeduplicator | 近似去重 | 结合MinHash与LSH,通过将集合中的元素哈希成一个较小的签名(通常是一个固定长度的整数或比特串),从而以很小的内存占用和低计算成本比较两个集合之间的相似度。 | Paper |

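The SimHashDeduplicator above compares 64-bit SimHash fingerprints by Hamming distance. The dependency-free sketch below shows that fingerprinting scheme; the uniform token weighting and any distance cutoff are illustrative assumptions, not the operator's configuration.

```python
import hashlib

def simhash(text: str, bits: int = 64) -> int:
    """Build a SimHash fingerprint from word-level features of the text."""
    v = [0] * bits
    for token in text.lower().split():
        h = int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:8], "big")
        for i in range(bits):
            v[i] += 1 if (h >> i) & 1 else -1
    return sum(1 << i for i in range(bits) if v[i] > 0)

def hamming_distance(a: int, b: int) -> int:
    """Number of differing bits between two fingerprints."""
    return bin(a ^ b).count("1")

a = simhash("the quick brown fox jumps over the lazy dog")
b = simhash("the quick brown fox jumped over the lazy dog")
c = simhash("completely unrelated text about something else")
print(hamming_distance(a, b), hamming_distance(a, c))  # near-duplicates -> small distance
```
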
| 名称 | 适用类型 | 简介 | 官方仓库或论文 |
|---|---|---|---|
| GeneralFilter | 任意Dataframe | 支持通过一/多个自定义lambda函数对 DataFrame 进行灵活过滤 | - |
| LanguageFilter | 预训练、SFT | 使用fasttext语言识别模型过滤特定语言 | Huggingface |
| BlocklistFilter | 预训练、SFT | 设置阈值,根据List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words屏蔽词表过滤数据点 | Code |