Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions notebooks/soc_2025_05_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
from occupational_classification.hierarchy import soc_hierarchy
from occupational_classification.lookup.soc_lookup import SOCLookup, SOCRephraseLookup
from occupational_classification.meta.soc_meta import SOC_META, SocMeta
from occupational_classification.meta.soc_meta import SOCmeta, SocMeta

# %% [markdown]
# ### Read the data from files using get_config()
Expand Down Expand Up @@ -132,16 +132,16 @@
soc_meta = SocMeta()

# %%
SOC_META["3"]
SOCmeta["3"]

# %% [markdown]
# The available fields are code, soc2020_group_title, group_description, qualifications, and tasks.

# %%
SOC_META["3"].get("group_description")
SOCmeta["3"].get("group_description")

# %%
SOC_META["3"]
SOCmeta["3"]

# %%
soc_lookup.meta.get_meta_by_code("2431")
Expand Down
45 changes: 37 additions & 8 deletions src/occupational_classification/lookup/soc_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ def _normalise_lookup_dataframe(data: pd.DataFrame) -> pd.DataFrame:
)
out = data[[text_col, code_col]].copy()
out = out.rename(columns={text_col: "description", code_col: "label"})
out["description"] = out["description"].astype(str).str.strip().str.lower()
out["label"] = out["label"].astype(str).str.strip()
# Match SICLookup: lower-case descriptions only (no strip on load or lookup).
out["description"] = out["description"].astype(str).str.lower()
out["label"] = out["label"].astype(str)
out = out.dropna(subset=["description", "label"])
return out

Expand Down Expand Up @@ -82,7 +83,16 @@ def __init__(
"label"
]

def lookup(self, description: str, similarity: bool = False) -> dict[str, Any]:
@staticmethod
def _normalise_meta(meta: dict[str, Any]) -> Optional[dict[str, Any]]:
    """Convert an error-carrying metadata result into ``None``.

    Args:
        meta: Metadata dictionary produced by a metadata lookup.

    Returns:
        ``meta`` itself when it has no ``"error"`` key, otherwise ``None``.
    """
    # The presence of an "error" key is what marks a failed lookup.
    return None if "error" in meta else meta

def lookup( # pylint: disable=too-many-locals
self, description: str, similarity: bool = False
) -> dict[str, Any]:
"""Looks up an SOC code based on the given description.

Args:
Expand All @@ -97,17 +107,33 @@ def lookup(self, description: str, similarity: bool = False) -> dict[str, Any]:

matching_code: Optional[str] = self.lookup_dict.get(description)
matching_code_meta: Optional[dict[str, Any]] = None
minor_group_meta: Optional[dict[str, Any]] = None
sub_major_group_meta: Optional[dict[str, Any]] = None
major_group_meta: Optional[dict[str, Any]] = None

# Extract the first digit of the code as code_major_group
# Extract SOC hierarchy segments.
matching_code_minor_group: Optional[str] = None
matching_code_sub_major_group: Optional[str] = None
matching_code_major_group: Optional[str] = None
if matching_code:
if len(matching_code) == UNIT_CODE_LEN:
matching_code_minor_group = matching_code[:3]
matching_code_sub_major_group = matching_code[:2]
matching_code_major_group = matching_code[:1]
if self.meta is not None:
matching_code_meta = self.meta.get_meta_by_code(matching_code)
if "error" in matching_code_meta:
matching_code_meta = None
major_group_meta = self.meta.get_meta_by_code(matching_code_major_group)
matching_code_meta = self._normalise_meta(matching_code_meta)
if matching_code_minor_group is not None:
minor_group_meta = self._normalise_meta(
self.meta.get_meta_by_code_exact(matching_code_minor_group)
)
if matching_code_sub_major_group is not None:
sub_major_group_meta = self._normalise_meta(
self.meta.get_meta_by_code_exact(matching_code_sub_major_group)
)
major_group_meta = self._normalise_meta(
self.meta.get_meta_by_code(matching_code_major_group)
)

if not matching_code:
matching_code = None
Expand Down Expand Up @@ -149,6 +175,10 @@ def lookup(self, description: str, similarity: bool = False) -> dict[str, Any]:
"description": description,
"code": matching_code,
"code_meta": matching_code_meta,
"code_minor_group": matching_code_minor_group,
"code_minor_group_meta": minor_group_meta,
"code_sub_major_group": matching_code_sub_major_group,
"code_sub_major_group_meta": sub_major_group_meta,
"code_major_group": matching_code_major_group,
"code_major_group_meta": major_group_meta,
}
Expand Down Expand Up @@ -273,4 +303,3 @@ def process_json(self, input_json: dict[str, Any]) -> dict[str, Any]:
]

return input_json

Loading
Loading