Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ pip install workrb
| Skill Extraction House | `HouseSkillExtractRanking` | multi_label | 262 queries x 13891 targets | 28 |
| Skill Extraction Tech | `TechSkillExtractRanking` | multi_label | 338 queries x 13891 targets | 28 |
| Skill Extraction SkillSkape | `SkillSkapeExtractRanking` | multi_label | 1191 queries x 13891 targets | 28 |
| Skill Extraction TechWolf | `TechWolfSkillExtractRanking` | multi_label | 326 queries x 13891 targets | 28 |
| Skill Extraction SkillXL | `SkillXLSkillExtractRanking` | multi_label | 944 queries x 13891 targets | 28 |
| Skill Similarity SkillMatch-1K | `SkillMatch1kSkillSimilarityRanking` | single_label | 900 queries x 2648 targets | 1 |
| Skill Normalization ESCO | `ESCOSkillNormRanking` | multi_label | 72008 queries x 13939 targets | 28 |
| Skill Normalization MELS | `MELSRanking` | multi_label | 1722 queries x 19466 targets | 5 |
Expand Down
4 changes: 4 additions & 0 deletions src/workrb/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
from .ranking.skill_extraction import (
HouseSkillExtractRanking,
SkillSkapeExtractRanking,
SkillXLSkillExtractRanking,
TechSkillExtractRanking,
TechWolfSkillExtractRanking,
)
from .ranking.skill_similarity import SkillMatch1kSkillSimilarityRanking
from .ranking.skillnorm import ESCOSkillNormRanking
Expand All @@ -46,7 +48,9 @@
"MELSRanking",
"HouseSkillExtractRanking",
"TechSkillExtractRanking",
"TechWolfSkillExtractRanking",
"SkillSkapeExtractRanking",
"SkillXLSkillExtractRanking",
"SkillMatch1kSkillSimilarityRanking",
"ProjectCandidateRanking",
"SearchQueryCandidateRanking",
Expand Down
4 changes: 4 additions & 0 deletions src/workrb/tasks/ranking/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
from workrb.tasks.ranking.skill_extraction import (
HouseSkillExtractRanking,
SkillSkapeExtractRanking,
SkillXLSkillExtractRanking,
TechSkillExtractRanking,
TechWolfSkillExtractRanking,
)
from workrb.tasks.ranking.skill_similarity import SkillMatch1kSkillSimilarityRanking
from workrb.tasks.ranking.skillnorm import ESCOSkillNormRanking
Expand All @@ -38,5 +40,7 @@
"SearchQueryCandidateRanking",
"SkillMatch1kSkillSimilarityRanking",
"SkillSkapeExtractRanking",
"SkillXLSkillExtractRanking",
"TechSkillExtractRanking",
"TechWolfSkillExtractRanking",
]
140 changes: 137 additions & 3 deletions src/workrb/tasks/ranking/skill_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,7 @@ def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
if uri in target_uris_to_skill:
original_skill_to_target_skill[orig_skill] = target_uris_to_skill[uri]

df["label"] = df["label"].apply(
lambda orig_skill: original_skill_to_target_skill.get(orig_skill)
)
df["label"] = df["label"].apply(original_skill_to_target_skill.get)
# Drop rows where label is None
df = df[df["label"].notna()].reset_index(drop=True).copy()

Expand Down Expand Up @@ -217,6 +215,142 @@ def citation(self) -> str:
"""


@register_task()
class TechWolfSkillExtractRanking(BaseESCOSkillExtractRanking):
    """Ranking task for skill extraction on the TechWolf dataset.

    The data is pulled from the ``TechWolf/skill-extraction-techwolf``
    Hugging Face dataset; loading itself is inherited from the base class.
    """

    # ESCO version recorded as the dataset's original one.
    # NOTE(review): presumably consumed by the base-class loader — confirm.
    orig_esco_version = "1.1.0"

    def __init__(self, esco_version: str = "1.1.0", **kwargs):
        """Store the requested ESCO version and initialise the base task.

        Args:
            esco_version: ESCO version kept on the instance as ``esco_version``.
            **kwargs: Forwarded unchanged to the base-class initialiser.
        """
        # Set before delegating, as in the original ordering.
        self.esco_version = esco_version
        super().__init__(hf_name="TechWolf/skill-extraction-techwolf", **kwargs)

    @property
    def name(self) -> str:
        """Human-readable name of the TechWolf skill extraction task."""
        return "Skill Extraction TechWolf"

    @property
    def description(self) -> str:
        """One-sentence description of the TechWolf skill extraction task."""
        return (
            "Extract skills from text descriptions "
            "in a generic distribution of job descriptions."
        )

    @property
    def citation(self) -> str:
        """BibTeX entry to cite when using the TechWolf skill extraction task."""
        return """@article{decorte2023extreme,
title={Extreme multi-label skill extraction training using large language models},
author={Decorte, Jens-Joris and Verlinden, Severine and Van Hautte, Jeroen and Deleu, Johannes and Develder, Chris and Demeester, Thomas},
journal={arXiv preprint arXiv:2307.10778},
year={2023}
}
"""


@register_task()
class SkillXLSkillExtractRanking(BaseESCOSkillExtractRanking):
    """Ranking task for skill extraction on the SkillXL dataset.

    The data is pulled from the ``TechWolf/Skill-XL`` Hugging Face dataset.
    """

    # ESCO version the dataset's skill names are written in; compared against
    # ``esco_version`` in ``load_dataset`` to decide whether to translate.
    orig_esco_version = "1.1.0"

    def __init__(self, esco_version: str = "1.1.0", **kwargs):
        """Store the requested ESCO version and initialise the base task.

        Args:
            esco_version: ESCO version whose vocabulary forms the target space.
            **kwargs: Forwarded unchanged to the base-class initialiser.
        """
        # Set before delegating, as in the original ordering.
        self.esco_version = esco_version
        super().__init__(hf_name="TechWolf/Skill-XL", **kwargs)

    def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
        """Build the SkillXL ranking dataset for one language and split.

        Two things set SkillXL apart from the base-class loader: rows carry a
        boolean ``relevant`` column (only relevant rows are kept), and the
        skill name lives in a ``skill`` column rather than ``label``.

        Args:
            dataset_id: Language identifier, converted with ``Language(dataset_id)``.
            split: Split to load; only ``TEST`` and ``VAL`` are mapped.

        Returns:
            A ``RankingDataset`` whose queries are the sentences with at least
            one skill in the target vocabulary, whose targets are the indices
            of those skills, and whose target space is the ESCO vocabulary.

        Raises:
            KeyError: If ``split`` is neither ``TEST`` nor ``VAL``.
            AssertionError: If the loaded data lacks the ``relevant`` column.
        """
        language = Language(dataset_id)

        # Fetch the requested split from the hub and move it into pandas.
        hf_split_by_split = {DatasetSplit.TEST: "test", DatasetSplit.VAL: "validation"}
        dataset = load_dataset(self.hf_name, split=hf_split_by_split[split])
        assert isinstance(dataset, Dataset)
        frame = dataset.to_pandas()
        assert isinstance(frame, pd.DataFrame)
        assert "relevant" in frame.columns, "Expected 'relevant' column in the dataset."

        # Discard the sentences flagged as irrelevant.
        frame = frame[frame["relevant"]].reset_index(drop=True).copy()
        assert isinstance(frame, pd.DataFrame)

        # Skill names are stored in the original ESCO version, in English. Any
        # other version or language requires renaming them via their URI.
        if self.esco_version != self.orig_esco_version or language != Language.EN:
            source_esco = ESCO(version=self.orig_esco_version, language=Language.EN)
            source_name_by_uri = {
                uri: skill_name for skill_name, uri in source_esco.get_skills_uris().items()
            }

            target_esco = ESCO(version=self.esco_version, language=language)
            target_name_by_uri = {
                uri: skill_name for skill_name, uri in target_esco.get_skills_uris().items()
            }

            # Only URIs present in both taxonomies can be translated.
            renamed_skill = {
                source_name: target_name_by_uri[uri]
                for uri, source_name in source_name_by_uri.items()
                if uri in target_name_by_uri
            }

            frame["skill"] = frame["skill"].apply(renamed_skill.get)
            # Untranslatable skills came back as None; drop those rows.
            frame = frame[frame["skill"].notna()].reset_index(drop=True).copy()

        # One row per sentence, carrying the list of its skills.
        per_sentence = frame.groupby("sentence")["skill"].apply(list).reset_index()

        # Target space: the ESCO vocabulary for the requested version/language.
        vocabulary = ESCO(version=self.esco_version, language=language).get_skills_vocabulary()
        index_of_skill = {skill_name: position for position, skill_name in enumerate(vocabulary)}

        # Keep only skills found in the vocabulary (this excludes
        # "LABEL NOT PRESENT" and "UNDERSPECIFIED"); skip sentences left empty.
        kept_sentences = []
        kept_targets = []
        sentence_skill_pairs = zip(per_sentence["sentence"], per_sentence["skill"], strict=True)
        for sentence, skills in sentence_skill_pairs:
            target_ids = [index_of_skill[skill] for skill in skills if skill in index_of_skill]
            if target_ids:
                kept_sentences.append(sentence)
                kept_targets.append(target_ids)

        return RankingDataset(
            query_texts=kept_sentences,
            target_indices=kept_targets,
            target_space=vocabulary,
            dataset_id=dataset_id,
        )

    @property
    def name(self) -> str:
        """Human-readable name of the SkillXL skill extraction task."""
        return "Skill Extraction SkillXL"

    @property
    def description(self) -> str:
        """One-sentence description of the SkillXL skill extraction task."""
        return (
            "Extract skills from text descriptions "
            "in a generic distribution of job descriptions."
        )

    @property
    def citation(self) -> str:
        """BibTeX entry to cite when using the SkillXL skill extraction task."""
        return """@ARTICLE{contextmatch_2025,
author={Decorte, Jens-Joris and van Hautte, Jeroen and Develder, Chris and Demeester, Thomas},
journal={IEEE Access},
title={Efficient Text Encoders for Labor Market Analysis},
year={2025},
volume={13},
number={},
pages={133596-133608},
keywords={Taxonomy;Contrastive learning;Training;Annotations;Benchmark testing;Training data;Large language models;Computational efficiency;Accuracy;Terminology;Labor market analysis;text encoders;skill extraction;job title normalization},
doi={10.1109/ACCESS.2025.3589147}
}
"""


@register_task()
class SkillSkapeExtractRanking(BaseESCOSkillExtractRanking):
"""Skill Extraction from SkillSkape Ranking Task."""
Expand Down
Loading