diff --git a/README.md b/README.md
index 5deeb16..cd0d0a2 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,8 @@ pip install workrb
 | Skill Extraction House | `HouseSkillExtractRanking` | multi_label | 262 queries x 13891 targets | 28 |
 | Skill Extraction Tech | `TechSkillExtractRanking` | multi_label | 338 queries x 13891 targets | 28 |
 | Skill Extraction SkillSkape | `SkillSkapeExtractRanking` | multi_label | 1191 queries x 13891 targets | 28 |
+| Skill Extraction TechWolf | `TechWolfSkillExtractRanking` | multi_label | 326 queries x 13891 targets | 28 |
+| Skill Extraction SkillXL | `SkillXLSkillExtractRanking` | multi_label | 944 queries x 13891 targets | 28 |
 | Skill Similarity SkillMatch-1K | `SkillMatch1kSkillSimilarityRanking` | single_label | 900 queries x 2648 targets | 1 |
 | Skill Normalization ESCO | `ESCOSkillNormRanking` | multi_label | 72008 queries x 13939 targets | 28 |
 | Skill Normalization MELS | `MELSRanking` | multi_label | 1722 queries x 19466 targets | 5 |
diff --git a/src/workrb/tasks/__init__.py b/src/workrb/tasks/__init__.py
index b408655..ae66be8 100644
--- a/src/workrb/tasks/__init__.py
+++ b/src/workrb/tasks/__init__.py
@@ -21,7 +21,9 @@
 from .ranking.skill_extraction import (
     HouseSkillExtractRanking,
     SkillSkapeExtractRanking,
+    SkillXLSkillExtractRanking,
     TechSkillExtractRanking,
+    TechWolfSkillExtractRanking,
 )
 from .ranking.skill_similarity import SkillMatch1kSkillSimilarityRanking
 from .ranking.skillnorm import ESCOSkillNormRanking
@@ -46,7 +48,9 @@
     "MELSRanking",
     "HouseSkillExtractRanking",
     "TechSkillExtractRanking",
+    "TechWolfSkillExtractRanking",
     "SkillSkapeExtractRanking",
+    "SkillXLSkillExtractRanking",
     "SkillMatch1kSkillSimilarityRanking",
     "ProjectCandidateRanking",
     "SearchQueryCandidateRanking",
diff --git a/src/workrb/tasks/ranking/__init__.py b/src/workrb/tasks/ranking/__init__.py
index 2f0424f..63f87df 100644
--- a/src/workrb/tasks/ranking/__init__.py
+++ b/src/workrb/tasks/ranking/__init__.py
@@ -20,7 +20,9 @@
 from workrb.tasks.ranking.skill_extraction import (
     HouseSkillExtractRanking,
     SkillSkapeExtractRanking,
+    SkillXLSkillExtractRanking,
     TechSkillExtractRanking,
+    TechWolfSkillExtractRanking,
 )
 from workrb.tasks.ranking.skill_similarity import SkillMatch1kSkillSimilarityRanking
 from workrb.tasks.ranking.skillnorm import ESCOSkillNormRanking
@@ -38,5 +40,7 @@
     "SearchQueryCandidateRanking",
     "SkillMatch1kSkillSimilarityRanking",
     "SkillSkapeExtractRanking",
+    "SkillXLSkillExtractRanking",
     "TechSkillExtractRanking",
+    "TechWolfSkillExtractRanking",
 ]
diff --git a/src/workrb/tasks/ranking/skill_extraction.py b/src/workrb/tasks/ranking/skill_extraction.py
index afad28b..e790a17 100644
--- a/src/workrb/tasks/ranking/skill_extraction.py
+++ b/src/workrb/tasks/ranking/skill_extraction.py
@@ -102,9 +102,7 @@ def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
                 if uri in target_uris_to_skill:
                     original_skill_to_target_skill[orig_skill] = target_uris_to_skill[uri]
 
-            df["label"] = df["label"].apply(
-                lambda orig_skill: original_skill_to_target_skill.get(orig_skill)
-            )
+            df["label"] = df["label"].apply(original_skill_to_target_skill.get)
             # Drop rows where label is None
             df = df[df["label"].notna()].reset_index(drop=True).copy()
 
@@ -217,6 +215,169 @@ def citation(self) -> str:
 """
 
 
+@register_task()
+class TechWolfSkillExtractRanking(BaseESCOSkillExtractRanking):
+    """Skill Extraction from TechWolf Dataset Ranking Task.
+
+    Data comes from the ``TechWolf/skill-extraction-techwolf`` Hugging Face
+    dataset; loading is inherited from ``BaseESCOSkillExtractRanking``.
+    """
+
+    orig_esco_version = "1.1.0"
+
+    def __init__(self, esco_version: str = "1.1.0", **kwargs):
+        self.esco_version = esco_version
+        super().__init__(hf_name="TechWolf/skill-extraction-techwolf", **kwargs)
+
+    @property
+    def name(self) -> str:
+        """Skill extraction TechWolf task name."""
+        return "Skill Extraction TechWolf"
+
+    @property
+    def description(self) -> str:
+        """Skill extraction TechWolf task description."""
+        return (
+            "Extract skills from text descriptions in a generic distribution of job descriptions."
+        )
+
+    @property
+    def citation(self) -> str:
+        """Skill extraction TechWolf task citation."""
+        return """@article{decorte2023extreme,
+    title={Extreme multi-label skill extraction training using large language models},
+    author={Decorte, Jens-Joris and Verlinden, Severine and Van Hautte, Jeroen and Deleu, Johannes and Develder, Chris and Demeester, Thomas},
+    journal={arXiv preprint arXiv:2307.10778},
+    year={2023}
+}
+"""
+
+
+@register_task()
+class SkillXLSkillExtractRanking(BaseESCOSkillExtractRanking):
+    """Skill Extraction from SkillXL Dataset Ranking Task.
+
+    Data comes from the ``TechWolf/Skill-XL`` Hugging Face dataset. Loading is
+    overridden because the dataset layout differs from the base class (see
+    ``load_dataset``).
+    """
+
+    orig_esco_version = "1.1.0"
+
+    def __init__(self, esco_version: str = "1.1.0", **kwargs):
+        self.esco_version = esco_version
+        super().__init__(hf_name="TechWolf/Skill-XL", **kwargs)
+
+    def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
+        """Load SkillXL data, filtering to relevant rows.
+
+        SkillXL differs from the base class in two ways: it includes
+        irrelevant sentences (filtered via the ``relevant`` boolean column),
+        and the skill column is named ``skill`` instead of ``label``.
+
+        Args:
+            dataset_id: Language identifier; parsed with ``Language(dataset_id)``
+                and used as the language of the target ESCO vocabulary.
+            split: ``DatasetSplit.TEST`` or ``DatasetSplit.VAL``.
+
+        Returns:
+            A ``RankingDataset`` with one query per distinct sentence that has
+            at least one in-vocabulary skill, the indices of those skills in
+            the ESCO skill vocabulary, and that vocabulary as target space.
+
+        Raises:
+            KeyError: If ``split`` is neither ``TEST`` nor ``VAL``.
+            ValueError: If the dataset has no ``relevant`` column.
+        """
+        language = Language(dataset_id)
+        # Load data
+        split_names = {DatasetSplit.TEST: "test", DatasetSplit.VAL: "validation"}
+        dataset = load_dataset(self.hf_name, split=split_names[split])
+        assert isinstance(dataset, Dataset)
+        df = dataset.to_pandas()
+        assert isinstance(df, pd.DataFrame)
+        # Raise explicitly: a bare ``assert`` is stripped under ``python -O``.
+        if "relevant" not in df.columns:
+            raise ValueError("Expected 'relevant' column in the dataset.")
+
+        # Only keep rows where relevant is True
+        df = df[df["relevant"]].reset_index(drop=True).copy()
+        assert isinstance(df, pd.DataFrame)
+
+        # The target ESCO is needed both for translation and for the vocabulary,
+        # so build it once.
+        target_esco = ESCO(version=self.esco_version, language=language)
+
+        # If ESCO version is not 1.1.0 and / or language is not en, we need to translate the skills
+        if self.esco_version != self.orig_esco_version or language != Language.EN:
+            original_esco = ESCO(version=self.orig_esco_version, language=Language.EN)
+            original_skill_uris = original_esco.get_skills_uris()
+            original_uris_to_skill = {v: k for k, v in original_skill_uris.items()}
+
+            target_skill_uris = target_esco.get_skills_uris()
+            target_uris_to_skill = {v: k for k, v in target_skill_uris.items()}
+
+            original_skill_to_target_skill = {}
+            for uri, orig_skill in original_uris_to_skill.items():
+                if uri in target_uris_to_skill:
+                    original_skill_to_target_skill[orig_skill] = target_uris_to_skill[uri]
+
+            df["skill"] = df["skill"].apply(original_skill_to_target_skill.get)
+            # Drop rows where skill is None
+            df = df[df["skill"].notna()].reset_index(drop=True).copy()
+
+        grouped_df = df.groupby("sentence")["skill"].apply(list).reset_index()
+
+        # Load ESCO skill vocabulary for target version/language
+        skill_vocab = target_esco.get_skills_vocabulary()
+        skill2label = {skill: i for i, skill in enumerate(skill_vocab)}
+
+        # Filter skills that exist in vocabulary (Excludes "LABEL NOT PRESENT" and "UNDERSPECIFIED")
+        filtered_queries = []
+        filtered_labels = []
+        for query, skill_list in zip(grouped_df["sentence"], grouped_df["skill"], strict=True):
+            label_ids = [skill2label[skill] for skill in skill_list if skill in skill2label]
+            if not label_ids:
+                continue
+            filtered_queries.append(query)
+            filtered_labels.append(label_ids)
+
+        return RankingDataset(
+            query_texts=filtered_queries,
+            target_indices=filtered_labels,
+            target_space=skill_vocab,
+            dataset_id=dataset_id,
+        )
+
+    @property
+    def name(self) -> str:
+        """Skill extraction SkillXL task name."""
+        return "Skill Extraction SkillXL"
+
+    @property
+    def description(self) -> str:
+        """Skill extraction SkillXL task description."""
+        return (
+            "Extract skills from text descriptions in a generic distribution of job descriptions."
+        )
+
+    @property
+    def citation(self) -> str:
+        """Skill extraction SkillXL task citation."""
+        return """@ARTICLE{contextmatch_2025,
+    author={Decorte, Jens-Joris and van Hautte, Jeroen and Develder, Chris and Demeester, Thomas},
+    journal={IEEE Access},
+    title={Efficient Text Encoders for Labor Market Analysis},
+    year={2025},
+    volume={13},
+    number={},
+    pages={133596-133608},
+    keywords={Taxonomy;Contrastive learning;Training;Annotations;Benchmark testing;Training data;Large language models;Computational efficiency;Accuracy;Terminology;Labor market analysis;text encoders;skill extraction;job title normalization},
+    doi={10.1109/ACCESS.2025.3589147}
+}
+"""
+
+
 @register_task()
 class SkillSkapeExtractRanking(BaseESCOSkillExtractRanking):
     """Skill Extraction from SkillSkape Ranking Task."""