Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
---
## [0.1.6] - 2026-03-20

### Added
- Full in-code `SOCmeta` map from SOC 2020 Volume 1 (~550 entries).
- Layered SOC lookup metadata fields (minor, sub-major, and major group).

### Changed
- `SocMeta.get_meta_by_code()` now requires an exact code match; silent parent-group fallback removed.
- `SOCLookup` and `soc_hierarchy` use exact metadata lookup throughout.

---
## [0.1.3] - 2025-07-08

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "soc-classification-library"
version = "0.1.5"
version = "0.1.6"
description = "Standard Occupational Classification library"
authors = ["Steve Gibbard <steve.gibbard@ons.gov.uk>"]
license = "MIT"
Expand Down
19 changes: 12 additions & 7 deletions src/occupational_classification/hierarchy/soc_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,11 @@ def _define_codes_and_nodes(soc_df: pd.DataFrame):
code_node_dict = {}

for code in soc_df["code"]:
group_description = soc_meta.get_meta_by_code(code)["group_description"]
group_title = soc_meta.get_meta_by_code(code)["group_title"]
meta = soc_meta.get_meta_by_code_exact(code)
group_description = (
meta.get("group_description", "") if "error" not in meta else ""
)
group_title = meta.get("group_title", "") if "error" not in meta else ""
soc_node = SocNode(
code, group_title=group_title, group_description=group_description
)
Expand All @@ -270,11 +273,13 @@ def _populate_tasks_and_quals(nodes: list):
for node in nodes:
code = node.soc_code
if SocCode(code).code_length() == _SOC_CODE_LENGTH:
qual = soc_meta.get_meta_by_code(code)["entry_routes_and_quals"]
node.qualifications = qual

tasks_list = soc_meta.get_meta_by_code(code)["tasks"]
node.tasks = tasks_list
meta = soc_meta.get_meta_by_code_exact(code)
if "error" in meta:
node.qualifications = ""

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also retrieve the description into the node metadata?
(We may choose not to do it as part of this PR, because at some point we will want to load all this metadata from an XLS file instead anyway.)

@dstewartons dstewartons Jun 23, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, `group_description` is already populated on hierarchy nodes from `SocMeta`.

node.tasks = []
else:
node.qualifications = meta.get("entry_routes_and_quals", "")
node.tasks = meta.get("tasks", [])


def _populate_job_titles(nodes: list, soc_index: pd.DataFrame):
Expand Down
15 changes: 10 additions & 5 deletions src/occupational_classification/lookup/soc_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ def lookup( # pylint: disable=too-many-locals
matching_code_sub_major_group = matching_code[:2]
matching_code_major_group = matching_code[:1]
if self.meta is not None:
matching_code_meta = self.meta.get_meta_by_code(matching_code)
matching_code_meta = self._normalise_meta(matching_code_meta)
matching_code_meta = self._normalise_meta(
self.meta.get_meta_by_code_exact(matching_code)
)
if matching_code_minor_group is not None:
minor_group_meta = self._normalise_meta(
self.meta.get_meta_by_code_exact(matching_code_minor_group)
Expand All @@ -132,7 +133,7 @@ def lookup( # pylint: disable=too-many-locals
self.meta.get_meta_by_code_exact(matching_code_sub_major_group)
)
major_group_meta = self._normalise_meta(
self.meta.get_meta_by_code(matching_code_major_group)
self.meta.get_meta_by_code_exact(matching_code_major_group)
)

if not matching_code:
Expand All @@ -156,7 +157,9 @@ def lookup( # pylint: disable=too-many-locals
major_groups = [
{
"code": major_group_code,
"meta": self.meta.get_meta_by_code(major_group_code),
"meta": self._normalise_meta(
self.meta.get_meta_by_code_exact(major_group_code)
),
}
for major_group_code in major_group_codes
]
Expand Down Expand Up @@ -199,7 +202,9 @@ def lookup_code_major_group(
matching_code_major_group: Optional[str] = code[:1] if code else None
major_group_meta: Optional[dict[str, Any]] = None
if self.meta is not None and matching_code_major_group is not None:
major_group_meta = self.meta.get_meta_by_code(matching_code_major_group)
major_group_meta = self._normalise_meta(
self.meta.get_meta_by_code_exact(matching_code_major_group)
)
return {
"code_major_group": matching_code_major_group,
"code_major_group_meta": major_group_meta,
Expand Down
33 changes: 6 additions & 27 deletions src/occupational_classification/meta/soc_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -4171,39 +4171,18 @@ def __init__(self):
self.soc_meta = SOCmeta

def get_meta_by_code(self, code: str) -> dict:
"""Retrieve metadata for a given SOC code with parent fallback."""
entry = self.soc_meta.get(code)
if entry is not None:
return {
"code": code,
"group_title": entry.get("group_title", ""),
"group_description": entry.get("group_description", ""),
"entry_routes_and_quals": entry.get("entry_routes_and_quals", ""),
"tasks": entry.get("tasks", []),
}
lookup = code[:-1]
while lookup:
entry = self.soc_meta.get(lookup)
if entry is not None:
return {
"code": lookup,
"group_title": entry.get("group_title", ""),
"group_description": entry.get("group_description", ""),
"entry_routes_and_quals": entry.get("entry_routes_and_quals", ""),
"tasks": entry.get("tasks", []),
}
lookup = lookup[:-1]
return {"error": f"No metadata found for SOC code {code}"}

def get_meta_by_code_exact(self, code: str) -> dict:
"""Retrieve metadata only for an exact SOC code match."""
"""Retrieve metadata for an exact SOC code match only."""
entry = self.soc_meta.get(code)
if entry is None:
return {"error": f"No exact metadata found for SOC code {code}"}
return {"error": f"No metadata found for SOC code {code}"}
return {
"code": code,
"group_title": entry.get("group_title", ""),
"group_description": entry.get("group_description", ""),
"entry_routes_and_quals": entry.get("entry_routes_and_quals", ""),
"tasks": entry.get("tasks", []),
}

def get_meta_by_code_exact(self, code: str) -> dict:
"""Retrieve metadata only for an exact SOC code match."""
return self.get_meta_by_code(code)
20 changes: 20 additions & 0 deletions tests/test_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,26 @@ def test_soc_lookup_default_path_uses_example_csv():
result = lookup.lookup("chief executives and senior officials")
assert result["code"] == "1111"
assert result["code_major_group"] == "1"
assert result["code_meta"] is not None
assert result["code_meta"]["code"] == "1111"


def test_lookup_returns_null_code_meta_when_metadata_missing(tmp_path):
    """A matched code with no SOC metadata entry yields ``code_meta`` of None."""
    csv_path = tmp_path / "mock_soc_data_missing_meta.csv"
    # Single-row index; the label "9999" is expected to have no metadata entry.
    pd.DataFrame(
        {
            "description": ["unknown occupation"],
            "label": ["9999"],
        }
    ).to_csv(csv_path, index=False)

    result = SOCLookup(data_path=str(csv_path)).lookup("unknown occupation")

    # The code itself must still resolve; only its metadata is absent.
    assert result["code"] == "9999"
    assert result["code_meta"] is None


def test_soc_lookup_rejects_non_csv_path(tmp_path):
Expand Down
1 change: 1 addition & 0 deletions tests/test_soc_lookup_example_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def test_soc_lookup_example_exact_match():
assert result["description"] == "chief executives and senior officials"
# With always-on SocMeta, metadata should be present for the example code
assert result["code_meta"] is not None
assert result["code_meta"]["code"] == "1111"
assert result["code_major_group"] == "1"
assert result["code_major_group_meta"] is not None

Expand Down
23 changes: 23 additions & 0 deletions tests/test_soc_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Tests for SocMeta exact-match behaviour."""

from occupational_classification.meta.soc_meta import SocMeta


def test_get_meta_by_code_returns_exact_unit_metadata():
    """A known unit-group code resolves to its own metadata record."""
    record = SocMeta().get_meta_by_code("1111")
    expected_title = "Chief executives and senior officials"

    # A successful lookup carries no "error" key and echoes the requested code.
    assert "error" not in record
    assert record["code"] == "1111"
    assert record["group_title"] == expected_title


def test_get_meta_by_code_does_not_fallback_to_parent_group():
    """An unknown code yields the error payload, not a parent group's metadata."""
    unknown_code = "9999"
    expected = {"error": "No metadata found for SOC code 9999"}

    assert SocMeta().get_meta_by_code(unknown_code) == expected


def test_get_meta_by_code_exact_matches_get_meta_by_code():
    """Both lookup entry points return equal results for the same code."""
    lookup = SocMeta()
    via_exact = lookup.get_meta_by_code_exact("2112")
    via_plain = lookup.get_meta_by_code("2112")

    assert via_exact == via_plain
Loading