From 0ffe78232f91c18bc9feea8ad3f8567c659f32e1 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Mon, 11 May 2026 12:28:07 +0000 Subject: [PATCH 01/24] Add prompt and validation model for initial SOC classificaiton --- .../llm/prompt.py | 50 ++++++++++++++ .../models/response_model.py | 68 +++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index af71a4f..f7ca4d1 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -30,6 +30,7 @@ from occupational_classification_utils.models.response_model import ( RagResponse, SocResponse, + UnambiguousResponse, ) config = get_config() @@ -150,6 +151,55 @@ }, ) + +_soc_template_unambiguous = """"You are an expert in ocucpational classifications. +You are tasked with determining whether a survey response can be assigned to a +single 4-digit UK Standard Occupational Classification (SOC) code based on initial respondent data alone. + +Key objective: Determine if the response can be coded unambiguously to a single 4-digit SOC code. + +Assignment logic: +1. Code as unambiguous when response can be coded to a single 4-digit SOC code with 99 +per cent confidence based on available evidence. +2. Code as uncodable to 4-digit when multiple candidates are plausible and +additional information is needed to distinguish between them. + +===Analysis steps=== +Follow these steps in order: +1. Review each candidate from the shortlist of relevant SOC codes against the respondent data. +2. Assess alignment - Consider: + - Semantic similarity between respondent descriptions and SOC code descriptions + - Job role compatibility with typical activities in each SOC code + - Industry context alignment + - Matches with specific examples listed under each code. +3. Assign confidence scores - Rate each candidate from 0.1 (least likely) to 0.9 (most likely). +4. Decide if response can be codeded unambiguously to a single 4-digit SOC code with 99 per cent confidence. +5. Provide reasoning for your decision. + +===Respondent Data=== +- Industry description: {industry_descr} +- Job Title: {job_title} +- Job Description: {job_description} +- Level of Education: {level_of_education} + +===Shortlist=== +{soc_candidates} + +===Output Format=== +{format_instructions} +""" + +parser_unambiguous = PydanticOutputParser( # type: ignore # Suspect langchain ver bug + pydantic_object=UnambiguousResponse +) + +SOC_PROMPT_UNAMBIGUOUS = PromptTemplate.from_template( + template=_core_prompt + _soc_template_unambiguous, + partial_variables={ + "format_instructions": parser_unambiguous.get_format_instructions(), + }, +) + FIX_PARSING_PROMPT = PromptTemplate.from_template( """You are a meticulous assistant tasked with ensuring that the output from a language model adheres strictly to the required JSON format. diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py index 87da5e6..994f765 100644 --- a/src/occupational_classification_utils/models/response_model.py +++ b/src/occupational_classification_utils/models/response_model.py @@ -271,3 +271,71 @@ class SurveyAssistSocResponse(BaseModel): selected. Specifies the information used to assign the SOC code or any additional information required to assign a SOC code.""", ) + + +class UnambiguousResponse(BaseModel): + """Represents a response model for classification code assignment. + + Attributes: + codable (bool): True only if enough information is provided to assign + an unambiguous single classification code, False otherwise. + class_code (Optional[str]): Full classification code (to the required number of digits) + assigned based on provided respondent's data. Must be present if codable=True, + must be None if codable=False. + class_descriptive (Optional[str]): Descriptive label of the classification category. + Must be present if codable=True, must be None if codable=False. + alt_candidates (list[RagCandidate]): Short list of possible classification codes with their + descriptive labels and estimated likelihoods. + reasoning (str): Step by step reasoning behind the classification selected. + """ + + codable: bool = Field( + description="True only if enough information is provided to decide an unambiguous " + "classification code, False otherwise." + ) + + class_code: Optional[str] = Field( + default=None, + description="Full classification code (to the required number of digits) " + "assigned based on provided respondent's data. Must be present if codable=True, " + "must be None if codable=False.", + ) + + class_descriptive: Optional[str] = Field( + default=None, + description="Descriptive label of the classification category. " + "Must be present if codable=True, must be None if codable=False.", + ) + + alt_candidates: list[RagCandidate] = Field( + default_factory=list, + description="Short list of possible classification codes with their " + "descriptive labels and estimated likelihoods.", + min_length=1, # Ensure there's always at least one candidate + max_length=10, # Limit to less than 10 candidates + ) + + reasoning: str = Field( + description="Step by step reasoning behind the classification selected.", + min_length=50, # Ensure detailed reasoning is provided + ) + + @field_validator("alt_candidates") + @classmethod + def validate_alt_candidates(cls, v): + """Validates the number of alternative candidates. + + Ensures that the number of candidates is between 1 and the maximum allowed. + + Args: + v (list): The list of alternative candidates. + + Returns: + list: The validated list of candidates. + + Raises: + ValueError: If the number of candidates is not within the allowed range. + """ + if not 1 <= len(v) <= MAX_ALT_CANDIDATES: + raise ValueError("alt_candidates must contain between 1 and 10 items.") + return v \ No newline at end of file From 1db4e8197838aeb859a055127708389d783e9307 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Mon, 11 May 2026 15:07:43 +0000 Subject: [PATCH 02/24] add llm method for initial SOC classification --- .../llm/llm.py | 169 +++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index 8a93597..f809d1e 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -15,6 +15,7 @@ (None at the module level) """ +import time from collections import defaultdict from functools import lru_cache from typing import Any, Optional, Union @@ -37,8 +38,9 @@ FIX_PARSING_PROMPT, SA_SOC_PROMPT_RAG, SOC_PROMPT_PYDANTIC, + SOC_PROMPT_UNAMBIGUOUS, ) -from occupational_classification_utils.models.response_model import SocResponse +from occupational_classification_utils.models.response_model import SocResponse, UnambiguousResponse logger = get_logger(__name__) config = get_config() @@ -103,6 +105,7 @@ def __init__( # noqa: PLR0913 self.soc_meta = get_soc_meta(config["lookups"]["soc_structure"]) self.soc_prompt = SOC_PROMPT_PYDANTIC self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG + self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS self.soc: Optional[SOC] = None self.verbose = verbose @@ -399,3 +402,167 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): ) return validated_answer, short_list, call_dict + + + async def unambiguous_soc_code( # noqa: PLR0913 + self, + industry_descr: str, + semantic_search_results: list[dict], + job_title: Optional[str] = None, + job_description: Optional[str] = None, + level_of_education: Optional[str] = None, + candidates_limit: int = config["llm"]["candidates_limit"], + code_digits: int = config["llm"]["code_digits"], + correlation_id: Optional[str] = None, + ) -> tuple[UnambiguousResponse, Optional[Any]]: + """Evaluates codability to a single 4-digit SOC code based on respondent's data. + + Args: + industry_descr (str): The description of the industry. + semantic_search_results (list of dicts): List of semantic search results. + job_title (str, optional): The job title. Defaults to None. + job_description (str, optional): The job description. Defaults to None. + level_of_education (str, optional): The level od education. Defaults to None. + candidates_limit (int, optional): The maximum number of candidates + to include in the prompt. Defaults to 5. + code_digits (int, optional): The number of digits to consider from + the code for filtering candidates. Defaults to 5. + correlation_id (str, optional): Optional correlation ID for request tracking. + + Returns: + UnambiguousResponse: The generated response to the query. + + Raises: + ValueError: If there is an error during the parsing of the response. + ValueError: If the default embedding handler is required but + not loaded correctly. + + """ + soc_candidates = self._prompt_candidate_list( + short_list=semantic_search_results, + code_digits=code_digits, + candidates_limit=candidates_limit, + ) + + job_title = ( + "Unknown" if (job_title is None or job_title in {"", " "}) else job_title + ) + job_description = ( + "Unknown" + if (job_description is None or job_description in {"", " "}) + else job_description + ) + level_of_education = ( + "Unknown" if (level_of_education is None or level_of_education in {"", " "}) else level_of_education + ) + + call_dict = { + "industry_descr": industry_descr, + "job_title": job_title, + "job_description": job_description, + "level_of_education": level_of_education, + "soc_candidates": soc_candidates, + } + + if self.verbose: + final_prompt = self.soc_prompt_unambiguous.format(**call_dict) + logger.debug(final_prompt) + + chain = self.soc_prompt_unambiguous | self.llm + + # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier. + # logger.info( + # "LLM request sent - unambiguous_sic_code", + # job_title=truncate_identifier(job_title), + # job_description=truncate_identifier(job_description), + # level_of_education=truncate_identifier(level_of_education), + # industry_descr=truncate_identifier(industry_descr), + # correlation_id=correlation_id or "", + # ) + llm_start = time.perf_counter() + + try: + response = await chain.ainvoke(call_dict, return_only_outputs=True) + except ValueError as err: + logger.error( + f"Error from chain, exit early: {err}", + error=str(err), + correlation_id=correlation_id or "", + ) + validated_answer = UnambiguousResponse( + codable=False, + alt_candidates=[], + reasoning="Error from chain, exit early", + ) + return validated_answer, call_dict + + if self.verbose: + logger.debug(f"llm_response={response}") + + # Parse the output to the desired format + parser = PydanticOutputParser(pydantic_object=UnambiguousResponse) # type: ignore + try: + validated_answer = parser.parse(str(response.content)) + # Log LLM response received after successful parse + alt_candidates_count = len( + getattr(validated_answer, "alt_candidates", []) or [] + ) + codable = bool(getattr(validated_answer, "codable", False)) + selected_code = ( + str(getattr(validated_answer, "class_code", "")) if codable else "" + ) + llm_duration_ms = int((time.perf_counter() - llm_start) * 1000) + logger.info( + "LLM response received for unambiguous sic prompt", + codable=str(codable), + selected_code=selected_code, + alt_candidates_count=str(alt_candidates_count), + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", + ) + except (ValueError, AttributeError) as parse_error: + logger.error( + f"Failed to parse response: {parse_error}", + error=str(parse_error), + correlation_id=correlation_id or "", + ) + llm_duration_ms = int((time.perf_counter() - llm_start) * 1000) + logger.warning( + "Failed to parse response", + response_content=str(response.content), + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", + ) + + # send another llm request to fix the format (1 attempt) + try: + chain = FIX_PARSING_PROMPT | self.llm + response = await chain.ainvoke( + { + "llm_output": str(response.content), + "format_instructions": parser.get_format_instructions(), + }, + return_only_outputs=True, + ) + validated_answer = parser.parse(str(response.content)) + logger.debug("Successfully parsed reformatted response.") + + except (ValueError, AttributeError) as parse_error2: + logger.error( + f"Failed to parse response again: {parse_error2}", + error=str(parse_error2), + ) + logger.warning( + "Failed to parse response again", + response_content=str(response.content), + ) + reasoning = ( + f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>" + ) + validated_answer = UnambiguousResponse( + codable=False, + alt_candidates=[], + reasoning=reasoning, + ) + + return validated_answer, call_dict \ No newline at end of file From b54238d0fc46810b96c7fbd5e93b291957671f70 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 12 May 2026 15:29:59 +0000 Subject: [PATCH 03/24] add llm method, prompt and validation for stage 3 --- .../llm/llm.py | 164 ++++++++++++++++++ .../llm/prompt.py | 68 ++++++++ .../models/response_model.py | 24 ++- 3 files changed, 255 insertions(+), 1 deletion(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index f809d1e..6110f53 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -39,6 +39,7 @@ SA_SOC_PROMPT_RAG, SOC_PROMPT_PYDANTIC, SOC_PROMPT_UNAMBIGUOUS, + SOC_PROMPT_OPENFOLLOWUP, ) from occupational_classification_utils.models.response_model import SocResponse, UnambiguousResponse @@ -106,6 +107,7 @@ def __init__( # noqa: PLR0913 self.soc_prompt = SOC_PROMPT_PYDANTIC self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS + self.soc_prompt_openfollowup = SOC_PROMPT_OPENFOLLOWUP self.soc: Optional[SOC] = None self.verbose = verbose @@ -565,4 +567,166 @@ async def unambiguous_soc_code( # noqa: PLR0913 reasoning=reasoning, ) + return validated_answer, call_dict + + async def formulate_open_question( + self, + industry_descr: str, + job_title: str | None = None, + job_description: str | None = None, + level_of_education: str | None = None, + llm_output: SicCandidate | None = None, + correlation_id: str | None = None, + ) -> tuple[OpenFollowUp, Any]: + """Formulates an open-ended question using respondent data and survey design guidelines. + + Args: + industry_descr (str): The description of the industry. + job_title (str, optional): The job title. Defaults to None. + job_description (str, optional): The job description. Defaults to None. + level_of_education (str, optional): The level od education. Defaults to None. + llm_output (SicCandidate, optional): The response from the LLM model. + correlation_id (str, optional): Optional correlation ID for request tracking. + + Returns: + OpenFollowUp: The generated response to the query. + + Raises: + ValueError: If there is an error during the parsing of the response. + ValueError: If the default embedding handler is required but + not loaded correctly. + + """ + + def prep_call_dict(industry_descr, job_title, job_description, level_of_education, llm_output): + # Helper function to prepare the call dictionary + is_job_title_present = job_title is None or job_title in {"", " "} + job_title = "Unknown" if is_job_title_present else job_title + + is_job_description_present = job_description is None or job_description in { + "", + " ", + } + job_description = ( + "Unknown" if is_job_description_present else job_description + ) + level_of_education = ( + "Unknown" if (level_of_education is None or level_of_education in {"", " "}) else level_of_education + ) + + call_dict = { + "industry_descr": industry_descr, + "job_title": job_title, + "job_description": job_description, + "level_of_education": level_of_education, + "llm_output": str(llm_output), + } + return call_dict + + call_dict = prep_call_dict( + industry_descr=industry_descr, + job_title=job_title, + job_description=job_description, + level_of_education=level_of_education, + llm_output=llm_output, + ) + + if self.verbose: + final_prompt = self.soc_prompt_openfollowup.format(**call_dict) + logger.debug(final_prompt) + + chain = self.soc_prompt_openfollowup | self.llm + + # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier. + # logger.info( + # "LLM request sent - formulate_open_question", + # job_title=truncate_identifier(job_title), + # job_description=truncate_identifier(job_description), + # level_of_education=truncate_identifier(level_of_education), + # industry_descr=truncate_identifier(industry_descr), + # correlation_id=correlation_id or "", + # ) + llm_start = time.perf_counter() + + try: + response = await chain.ainvoke(call_dict, return_only_outputs=True) + except (ValueError, AttributeError) as err: + logger.error( + f"Error from LLMChain, exit early: {err}", + error=str(err), + correlation_id=correlation_id or "", + ) + logger.warning( + "Error from LLMChain, exit early", + correlation_id=correlation_id or "", + ) + validated_answer = OpenFollowUp( + followup=None, + reasoning="Error from LLMChain, exit early", + ) + return validated_answer, call_dict + + llm_duration_ms = int((time.perf_counter() - llm_start) * 1000) + + # Parse the output to the desired format + parser = PydanticOutputParser(pydantic_object=OpenFollowUp) + try: + validated_answer = parser.parse(str(response.content)) + # Log LLM response received after successful parse + has_followup = bool(getattr(validated_answer, "followup", None)) + logger.info( + "LLM response received for open question prompt", + has_followup=str(has_followup), + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", + ) + except (ValueError, AttributeError) as parse_error: + logger.error( + f"Failed to parse response: {parse_error}", + error=str(parse_error), + correlation_id=correlation_id or "", + ) + logger.warning( + "Failed to parse response", + response_content=str(response.content), + correlation_id=correlation_id or "", + ) + logger.info( + "LLM response received for open question prompt", + has_followup="False", + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", + ) + try: + chain = FIX_PARSING_PROMPT | self.llm + response = await chain.ainvoke( + { + "llm_output": str(response.content), + "format_instructions": parser.get_format_instructions(), + }, + return_only_outputs=True, + ) + validated_answer = parser.parse(str(response.content)) + logger.debug("Successfully parsed reformatted response.") + + except (ValueError, AttributeError) as parse_error2: + logger.error( + f"Failed to parse response again: {parse_error2}", + error=str(parse_error2), + ) + logger.warning( + "Failed to parse response again", + response_content=str(response.content), + ) + reasoning = ( + f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>" + ) + validated_answer = OpenFollowUp( + followup=None, + reasoning=reasoning, + ) + + if self.verbose: + logger.debug(f"{response=}") + return validated_answer, call_dict \ No newline at end of file diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index f7ca4d1..b813541 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -31,6 +31,7 @@ RagResponse, SocResponse, UnambiguousResponse, + OpenFollowUp, ) config = get_config() @@ -214,3 +215,70 @@ {format_instructions} """ ) + + +_open_follow_up = """"You are an expert survey methodologist specialising in + UK industrial classification (UK SOC). Generate one open-ended follow-up question + to help assign the most relevant UK SOC code. + +Objective +- Produce exactly one question that elicits the key information needed to distinguish + between the shortlisted SOC candidates, focusing on the employer's main business activity. + +Inputs +- Respondent data: +- Company's main activity: {industry_descr} +- Job title: {job_title} +- Job description: {job_description} +- Level of Education: {level_of_education} +- Shortlist from previous model: {llm_output} +- Note: These are candidate SOC categories; do not mention codes or "SOC" to the respondent. + +How to decide what to ask +- Identify the smallest, most informative difference among the candidates and target that with a single question. +- Prioritise discriminators in this order: +1) Stage in the value chain (e.g., manufacture/processing vs wholesale vs retail vs repair/installation vs + rental/leasing vs publishing/software vs consultancy/training). +2) Main product or service category (what goods/services the employer mainly provides). +3) Main customer type (households vs businesses vs government/health/education). +4) Delivery mode or setting (on-site vs online; physical goods vs digital; own-brand vs third-party). +- Ask about only one discriminator—the one most likely to resolve the ambiguity. + +Quality standards +- Language and clarity: + - Use plain British English; avoid or define jargon and abbreviations. + - Keep the single question concise (max 25 words), grammatically correct, and neutral. + - Use "employer" for for-profit; use "organisation" for non-profits, charities, public bodies, and education. + Default to "employer", if ambiguous. + - Refer to the present situation (e.g., "currently", "main"). + - Do not mention SOC or any code numbers. + - Do not ask for company names, client names, or other personal/sensitive data. +- Question structure: + - Start with "What", "How", "Which", or "Where". + - Focus on the employer's main business activities, products, or services—not the respondent's personal tasks. + - One issue per question; no A/B or either/or phrasing; avoid binary questions. + - Limit to one sentence ending with a question mark. + - You may add one additional sentence with broad, non-leading examples covering a wide range of options; + omit examples if they would be leading. +- Respondent considerations: + - Make it easy to answer in a few words. + - Ask only what a typical employee would reasonably know. + - Avoid requiring calculations or percentages. + +Edge cases +- If the shortlist is empty or clearly points to one category, ask a general clarifying question about + the main product/service or value-chain stage to confirm classification. +- Do not output explanations or reasoning; only the formatted result. + +Output format +- Return output that strictly follows: +{format_instructions} +""" +parser_followup_open = PydanticOutputParser(pydantic_object=OpenFollowUp) + +SOC_PROMPT_OPENFOLLOWUP = PromptTemplate.from_template( + template=_core_prompt + _open_follow_up, + partial_variables={ + "format_instructions": parser_followup_open.get_format_instructions(), + }, +) diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py index 994f765..fbb48a7 100644 --- a/src/occupational_classification_utils/models/response_model.py +++ b/src/occupational_classification_utils/models/response_model.py @@ -338,4 +338,26 @@ def validate_alt_candidates(cls, v): """ if not 1 <= len(v) <= MAX_ALT_CANDIDATES: raise ValueError("alt_candidates must contain between 1 and 10 items.") - return v \ No newline at end of file + return v + + +class OpenFollowUp(BaseModel): + """Represents a response model for open ended follow-up question. + + Attributes: + followup (str): Question to ask user in order to collect + additional information to enable reliable classification assignment. + reasoning (str): Reasoning explaining how follow-up question will help + assign classification code. + """ + + followup: str | None = Field( + description="""Question to ask user in order to collect additional information + to enable reliable classification assignment.""", + default="", + ) + reasoning: str = Field( + description="""Reasoning explaining how follow-up question will help + assign classification code.""", + default="", + ) \ No newline at end of file From 0a5b90736dd955b9dd3c95662cf92b29818bcd99 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Fri, 29 May 2026 13:15:59 +0000 Subject: [PATCH 04/24] reflect types as in ruff --- .../llm/llm.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index 2025fa3..3fdbb02 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -18,7 +18,7 @@ import time from collections import defaultdict from functools import lru_cache -from typing import Any, Optional, Union +from typing import Any import numpy as np from langchain.output_parsers import PydanticOutputParser @@ -75,11 +75,11 @@ class ClassificationLLM: def __init__( # noqa: PLR0913 self, model_name: str = DEFAULT_LLM_MODEL, - llm: Optional[Union[ChatVertexAI, ChatOpenAI]] = None, + llm: ChatVertexAI | ChatOpenAI | None = None, max_tokens: int = 1600, temperature: float = 0.0, verbose: bool = True, - openai_api_key: Optional[SecretStr] = None, + openai_api_key: SecretStr | None = None, ): """Initialises the ClassificationLLM object.""" logger.info( @@ -113,7 +113,7 @@ def __init__( # noqa: PLR0913 self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS self.soc_prompt_openfollowup = SOC_PROMPT_OPENFOLLOWUP - self.soc: Optional[SOC] = None + self.soc: SOC | None = None self.verbose = verbose @lru_cache # noqa: B019 @@ -207,7 +207,7 @@ def _prompt_candidate( def _prompt_candidate_list( self, - short_list: Union[list[dict], list[tuple[Document, float]]], # list[dict], + short_list: list[dict] | list[tuple[Document, float]], # list[dict], chars_limit: int = 14000, candidates_limit: int = 5, titles_limit: int = 3, @@ -269,13 +269,13 @@ def _prompt_candidate_list( async def sa_rag_soc_code( # noqa: PLR0913 self, industry_descr: str, - job_title: Optional[str] = None, - job_description: Optional[str] = None, + job_title: str | None = None, + job_description: str | None = None, expand_search_terms: bool = True, code_digits: int = 4, candidates_limit: int = 5, - short_list: Optional[list[dict[Any, Any]]] = None, - ) -> tuple[SocResponse, Optional[list[dict[Any, Any]]], Optional[Any]]: + short_list: list[dict[Any, Any]] | None = None, + ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]: """Generates a SOC classification based on respondent's data using RAG approach. Caller must provide short_list (e.g. from vector store API). Mirrors @@ -414,13 +414,13 @@ async def unambiguous_soc_code( # noqa: PLR0913 self, industry_descr: str, semantic_search_results: list[dict], - job_title: Optional[str] = None, - job_description: Optional[str] = None, - level_of_education: Optional[str] = None, + job_title: str | None = None, + job_description: str | None = None, + level_of_education: str | None = None, candidates_limit: int = 5, code_digits: int = 4, - correlation_id: Optional[str] = None, - ) -> tuple[UnambiguousResponse, Optional[Any]]: + correlation_id: str | None = None, + ) -> tuple[UnambiguousResponse, Any | None]: """Evaluates codability to a single 4-digit SOC code based on respondent's data. Args: From 7d76e93b6929836b78999505f0508418fa79ef88 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Fri, 29 May 2026 15:05:22 +0000 Subject: [PATCH 05/24] add tests to meet 80% coverage --- tests/test_llm.py | 63 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/tests/test_llm.py b/tests/test_llm.py index ae2c533..d3e0205 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -16,11 +16,18 @@ from occupational_classification.data_access.soc_data_access import ( load_soc_structure as lib_load_soc_structure, ) -from occupational_classification.hierarchy.soc_hierarchy import load_hierarchy +from occupational_classification.hierarchy.soc_hierarchy import ( + SOC, + SocNode, + load_hierarchy, +) from occupational_classification_utils.llm.llm import ClassificationLLM from occupational_classification_utils.llm.prompt import SA_SOC_PROMPT_RAG -from occupational_classification_utils.models.response_model import SocResponse +from occupational_classification_utils.models.response_model import ( + OpenFollowUp, + SocResponse, +) MODEL_NAME = "gemini-2.5-flash" LOCATION = "europe-west2" @@ -78,6 +85,32 @@ def mock_vertex_ai(): yield +@pytest.fixture +def prompt_candidate_soc(): + nodes = [ + SocNode( + soc_code="1234", + group_title="grouptitle1234", + group_description="description12345", + ), + SocNode( + soc_code="2345", + group_title="grouptitle2345", + group_description="description2345", + ), + ] + lookup = {} + for node in nodes: + lookup[str(node.soc_code)] = node + + print("LOOKUP", lookup) + + soc = SOC(nodes=nodes, lookup=lookup) + llm_class = ClassificationLLM(model_name=MODEL_NAME) + llm_class.soc = soc + return llm_class + + @pytest.mark.parametrize( "model, openai_api_key, expected_model", [ @@ -324,3 +357,29 @@ async def test_sa_rag_soc_code_short_list_is_none_raise_value_error( job_description="teach children", short_list=None, ) + + +@pytest.mark.llm +async def test_llm_response_mocked_formulate_open_question( + mocker, prompt_candidate_soc +): + mock_object_dict = {"class_code": "", "class_descriptive": "", "likelihood": 0.5} + mock_object_json = json.dumps(mock_object_dict) + + mock_message = mocker.Mock(spec=AIMessage) + mock_message.content = mock_object_json + + mock_patcher = mocker.patch( # noqa: F841 + "occupational_classification_utils.llm.llm.ChatVertexAI.ainvoke", + return_value=mock_message, + ) + + result = await prompt_candidate_soc.formulate_open_question( + industry_descr="", + job_title="", + job_description="", + level_of_education="", + llm_output="", + ) + assert isinstance(result[0], OpenFollowUp) + assert isinstance(result[1], dict) From 40f0926256b54c4cd079a5e9d7ffc0c18682d3e5 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 2 Jun 2026 13:37:46 +0000 Subject: [PATCH 06/24] reorder llm.py to reflect changes made in main --- .../llm/llm.py | 264 +++++++++--------- 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index d944d7e..1b9d702 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -391,42 +391,38 @@ async def unambiguous_soc_code( # noqa: PLR0913 return validated_answer, call_dict - async def sa_rag_soc_code( # noqa: PLR0913 + async def formulate_open_question( # noqa: PLR0913 self, industry_descr: str, job_title: str | None = None, job_description: str | None = None, - code_digits: int = config["llm"]["code_digits"], - candidates_limit: int = config["llm"]["candidates_limit"], - short_list: list[dict[Any, Any]] | None = None, - ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]: - """Generates a SOC classification based on respondent's data using RAG approach. - - Caller must provide short_list (e.g. from vector store API). Mirrors - sic-classification-utils ``sa_rag_sic_code`` (raises when short_list is None). + level_of_education: str | None = None, + llm_output: SocCandidate | None = None, + correlation_id: str | None = None, + ) -> tuple[OpenFollowUp, Any]: + """Formulates an open-ended question using respondent data and survey design guidelines. Args: industry_descr (str): The description of the industry. job_title (str, optional): The job title. Defaults to None. job_description (str, optional): The job description. Defaults to None. - code_digits (int, optional): The number of digits in the generated - SOC code. Defaults to 4. - candidates_limit (int, optional): The maximum number of SOC code candidates - to consider. Defaults to 5. - short_list (list[dict[Any, Any]], optional): A list of results from - embedding or vector store search (e.g. from soc-classification-vector-store). - Each dict should have "code" and "title" keys. + level_of_education (str, optional): The level od education. Defaults to None. + llm_output (SocCandidate, optional): The response from the LLM model. + correlation_id (str, optional): Optional correlation ID for request tracking. Returns: - SocResponse: The generated response to the query. + OpenFollowUp: The generated response to the query. Raises: ValueError: If there is an error during the parsing of the response. - ValueError: If short_list is None. + ValueError: If the default embedding handler is required but + not loaded correctly. """ - def prep_call_dict(industry_descr, job_title, job_description, soc_codes): + def prep_call_dict( + industry_descr, job_title, job_description, level_of_education, llm_output + ): # Helper function to prepare the call dictionary is_job_title_present = job_title is None or job_title in {"", " "} job_title = "Unknown" if is_job_title_present else job_title @@ -438,63 +434,95 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): job_description = ( "Unknown" if is_job_description_present else job_description ) + level_of_education = ( + "Unknown" + if (level_of_education is None or level_of_education in {"", " "}) + else level_of_education + ) call_dict = { "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, - "soc_index": soc_codes, + "level_of_education": level_of_education, + "llm_output": str(llm_output), } return call_dict - if short_list is None: - raise ValueError( - "Short list is None - list provided from embedding search." - ) - - soc_codes = self._prompt_candidate_list( - short_list, code_digits=code_digits, candidates_limit=candidates_limit - ) - call_dict = prep_call_dict( industry_descr=industry_descr, job_title=job_title, job_description=job_description, - soc_codes=soc_codes, + level_of_education=level_of_education, + llm_output=llm_output, ) if self.verbose: - final_prompt = self.sa_soc_prompt_rag.format(**call_dict) - logger.debug(f"Final prompt: {final_prompt}") + final_prompt = self.soc_prompt_openfollowup.format(**call_dict) + logger.debug(final_prompt) - chain = self.sa_soc_prompt_rag | self.llm + chain = self.soc_prompt_openfollowup | self.llm + + # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier. + # logger.info( + # "LLM request sent - formulate_open_question", + # job_title=truncate_identifier(job_title), + # job_description=truncate_identifier(job_description), + # level_of_education=truncate_identifier(level_of_education), + # industry_descr=truncate_identifier(industry_descr), + # correlation_id=correlation_id or "", + # ) + llm_start = time.perf_counter() try: response = await chain.ainvoke(call_dict, return_only_outputs=True) - except ValueError as err: - logger.error(f"Error from chain, exit early: {err}", error=str(err)) - validated_answer = SocResponse( - followup="Follow-up question not available due to error.", - reasoning="Error from chain, exit early", + except (ValueError, AttributeError) as err: + logger.error( + f"Error from LLMChain, exit early: {err}", + error=str(err), + correlation_id=correlation_id or "", ) - return validated_answer, short_list, call_dict + logger.warning( + "Error from LLMChain, exit early", + correlation_id=correlation_id or "", + ) + validated_answer = OpenFollowUp( + followup=None, + reasoning="Error from LLMChain, exit early", + ) + return validated_answer, call_dict - if self.verbose: - logger.debug(f"LLM response: {response}") + llm_duration_ms = int((time.perf_counter() - llm_start) * 1000) - parser = PydanticOutputParser( # type: ignore # Suspect langchain ver bug - pydantic_object=SocResponse, - ) + # Parse the output to the desired format + parser = PydanticOutputParser(pydantic_object=OpenFollowUp) try: validated_answer = parser.parse(str(response.content)) + # Log LLM response received after successful parse + has_followup = bool(getattr(validated_answer, "followup", None)) + logger.info( + "LLM response received for open question prompt", + has_followup=str(has_followup), + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", + ) except (ValueError, AttributeError) as parse_error: logger.error( - f"Failed to parse response: {parse_error}", error=str(parse_error) + f"Failed to parse response: {parse_error}", + error=str(parse_error), + correlation_id=correlation_id or "", ) logger.warning( - "Failed to parse response", response_content=str(response.content) + "Failed to parse response", + response_content=str(response.content), + correlation_id=correlation_id or "", + ) + logger.info( + "LLM response received for open question prompt", + has_followup="False", + duration_ms=str(llm_duration_ms), + correlation_id=correlation_id or "", ) - try: chain = FIX_PARSING_PROMPT | self.llm response = await chain.ainvoke( @@ -506,6 +534,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): ) validated_answer = parser.parse(str(response.content)) logger.debug("Successfully parsed reformatted response.") + except (ValueError, AttributeError) as parse_error2: logger.error( f"Failed to parse response again: {parse_error2}", @@ -518,45 +547,52 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): reasoning = ( f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>" ) - validated_answer = SocResponse( - followup="Follow-up question not available due to error.", + validated_answer = OpenFollowUp( + followup=None, reasoning=reasoning, ) - return validated_answer, short_list, call_dict + if self.verbose: + logger.debug(f"{response=}") - async def formulate_open_question( # noqa: PLR0913 + return validated_answer, call_dict + + async def sa_rag_soc_code( # noqa: PLR0913 self, industry_descr: str, job_title: str | None = None, job_description: str | None = None, - level_of_education: str | None = None, - llm_output: SocCandidate | None = None, - correlation_id: str | None = None, - ) -> tuple[OpenFollowUp, Any]: - """Formulates an open-ended question using respondent data and survey design guidelines. + code_digits: int = config["llm"]["code_digits"], + candidates_limit: int = config["llm"]["candidates_limit"], + short_list: list[dict[Any, Any]] | None = None, + ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]: + """Generates a SOC classification based on respondent's data using RAG approach. + + Caller must provide short_list (e.g. from vector store API). Mirrors + sic-classification-utils ``sa_rag_sic_code`` (raises when short_list is None). Args: industry_descr (str): The description of the industry. job_title (str, optional): The job title. Defaults to None. job_description (str, optional): The job description. Defaults to None. - level_of_education (str, optional): The level od education. Defaults to None. - llm_output (SocCandidate, optional): The response from the LLM model. - correlation_id (str, optional): Optional correlation ID for request tracking. + code_digits (int, optional): The number of digits in the generated + SOC code. Defaults to 4. + candidates_limit (int, optional): The maximum number of SOC code candidates + to consider. Defaults to 5. + short_list (list[dict[Any, Any]], optional): A list of results from + embedding or vector store search (e.g. from soc-classification-vector-store). + Each dict should have "code" and "title" keys. Returns: - OpenFollowUp: The generated response to the query. + SocResponse: The generated response to the query. Raises: ValueError: If there is an error during the parsing of the response. - ValueError: If the default embedding handler is required but - not loaded correctly. + ValueError: If short_list is None. """ - def prep_call_dict( - industry_descr, job_title, job_description, level_of_education, llm_output - ): + def prep_call_dict(industry_descr, job_title, job_description, soc_codes): # Helper function to prepare the call dictionary is_job_title_present = job_title is None or job_title in {"", " "} job_title = "Unknown" if is_job_title_present else job_title @@ -568,95 +604,63 @@ def prep_call_dict( job_description = ( "Unknown" if is_job_description_present else job_description ) - level_of_education = ( - "Unknown" - if (level_of_education is None or level_of_education in {"", " "}) - else level_of_education - ) call_dict = { "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, - "level_of_education": level_of_education, - "llm_output": str(llm_output), + "soc_index": soc_codes, } return call_dict + if short_list is None: + raise ValueError( + "Short list is None - list provided from embedding search." + ) + + soc_codes = self._prompt_candidate_list( + short_list, code_digits=code_digits, candidates_limit=candidates_limit + ) + call_dict = prep_call_dict( industry_descr=industry_descr, job_title=job_title, job_description=job_description, - level_of_education=level_of_education, - llm_output=llm_output, + soc_codes=soc_codes, ) if self.verbose: - final_prompt = self.soc_prompt_openfollowup.format(**call_dict) - logger.debug(final_prompt) - - chain = self.soc_prompt_openfollowup | self.llm + final_prompt = self.sa_soc_prompt_rag.format(**call_dict) + logger.debug(f"Final prompt: {final_prompt}") - # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier. - # logger.info( - # "LLM request sent - formulate_open_question", - # job_title=truncate_identifier(job_title), - # job_description=truncate_identifier(job_description), - # level_of_education=truncate_identifier(level_of_education), - # industry_descr=truncate_identifier(industry_descr), - # correlation_id=correlation_id or "", - # ) - llm_start = time.perf_counter() + chain = self.sa_soc_prompt_rag | self.llm try: response = await chain.ainvoke(call_dict, return_only_outputs=True) - except (ValueError, AttributeError) as err: - logger.error( - f"Error from LLMChain, exit early: {err}", - error=str(err), - correlation_id=correlation_id or "", - ) - logger.warning( - "Error from LLMChain, exit early", - correlation_id=correlation_id or "", - ) - validated_answer = OpenFollowUp( - followup=None, - reasoning="Error from LLMChain, exit early", + except ValueError as err: + logger.error(f"Error from chain, exit early: {err}", error=str(err)) + validated_answer = SocResponse( + followup="Follow-up question not available due to error.", + reasoning="Error from chain, exit early", ) - return validated_answer, call_dict + return validated_answer, short_list, call_dict - llm_duration_ms = int((time.perf_counter() - llm_start) * 1000) + if self.verbose: + logger.debug(f"LLM response: {response}") - # Parse the output to the desired format - parser = PydanticOutputParser(pydantic_object=OpenFollowUp) + parser = PydanticOutputParser( # type: ignore # Suspect langchain ver bug + pydantic_object=SocResponse, + ) try: validated_answer = parser.parse(str(response.content)) - # Log LLM response received after successful parse - has_followup = bool(getattr(validated_answer, "followup", None)) - logger.info( - "LLM response received for open question prompt", - has_followup=str(has_followup), - duration_ms=str(llm_duration_ms), - correlation_id=correlation_id or "", - ) except (ValueError, AttributeError) as parse_error: logger.error( - f"Failed to parse response: {parse_error}", - error=str(parse_error), - correlation_id=correlation_id or "", + f"Failed to parse response: {parse_error}", error=str(parse_error) ) logger.warning( - "Failed to parse response", - response_content=str(response.content), - correlation_id=correlation_id or "", - ) - logger.info( - "LLM response received for open question prompt", - has_followup="False", - duration_ms=str(llm_duration_ms), - correlation_id=correlation_id or "", + "Failed to parse response", response_content=str(response.content) ) + try: chain = FIX_PARSING_PROMPT | self.llm response = await chain.ainvoke( @@ -668,7 +672,6 @@ def prep_call_dict( ) validated_answer = parser.parse(str(response.content)) logger.debug("Successfully parsed reformatted response.") - except (ValueError, AttributeError) as parse_error2: logger.error( f"Failed to parse response again: {parse_error2}", @@ -681,12 +684,9 @@ def prep_call_dict( reasoning = ( f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>" ) - validated_answer = OpenFollowUp( - followup=None, + validated_answer = SocResponse( + followup="Follow-up question not available due to error.", reasoning=reasoning, ) - if self.verbose: - logger.debug(f"{response=}") - - return validated_answer, call_dict + return validated_answer, short_list, call_dict From b419c6c9f854313b6d940ccbb61d29a4bdfd018c Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 2 Jun 2026 14:19:31 +0000 Subject: [PATCH 07/24] Use more general validation method --- src/occupational_classification_utils/llm/llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index 1b9d702..5361008 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -42,7 +42,7 @@ ) from occupational_classification_utils.models.response_model import ( OpenFollowUp, - SocCandidate, + RagCandidate, SocResponse, UnambiguousResponse, ) @@ -397,7 +397,7 @@ async def formulate_open_question( # noqa: PLR0913 job_title: str | None = None, job_description: str | None = None, level_of_education: str | None = None, - llm_output: SocCandidate | None = None, + llm_output: RagCandidate | None = None, correlation_id: str | None = None, ) -> tuple[OpenFollowUp, Any]: """Formulates an open-ended question using respondent data and survey design guidelines. @@ -407,7 +407,7 @@ async def formulate_open_question( # noqa: PLR0913 job_title (str, optional): The job title. Defaults to None. job_description (str, optional): The job description. Defaults to None. level_of_education (str, optional): The level od education. Defaults to None. - llm_output (SocCandidate, optional): The response from the LLM model. + llm_output (RagCandidate, optional): The response from the LLM model. correlation_id (str, optional): Optional correlation ID for request tracking. Returns: From a9b7c45d09bdc997623550f33b406fe8bc01ba10 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 2 Jun 2026 14:24:26 +0000 Subject: [PATCH 08/24] remove duplications in prompt.py --- .../llm/prompt.py | 67 ------------------- 1 file changed, 67 deletions(-) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index 43ed998..78be492 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -280,70 +280,3 @@ "format_instructions": parser_followup_open.get_format_instructions(), }, ) - - -_open_follow_up = """"You are an expert survey methodologist specialising in - UK industrial classification (UK SOC). Generate one open-ended follow-up question - to help assign the most relevant UK SOC code. - -Objective -- Produce exactly one question that elicits the key information needed to distinguish - between the shortlisted SOC candidates, focusing on the employer's main business activity. - -Inputs -- Respondent data: -- Company's main activity: {industry_descr} -- Job title: {job_title} -- Job description: {job_description} -- Level of Education: {level_of_education} -- Shortlist from previous model: {llm_output} -- Note: These are candidate SOC categories; do not mention codes or "SOC" to the respondent. - -How to decide what to ask -- Identify the smallest, most informative difference among the candidates and target that with a single question. -- Prioritise discriminators in this order: -1) Stage in the value chain (e.g., manufacture/processing vs wholesale vs retail vs repair/installation vs - rental/leasing vs publishing/software vs consultancy/training). -2) Main product or service category (what goods/services the employer mainly provides). -3) Main customer type (households vs businesses vs government/health/education). -4) Delivery mode or setting (on-site vs online; physical goods vs digital; own-brand vs third-party). -- Ask about only one discriminator—the one most likely to resolve the ambiguity. - -Quality standards -- Language and clarity: - - Use plain British English; avoid or define jargon and abbreviations. - - Keep the single question concise (max 25 words), grammatically correct, and neutral. - - Use "employer" for for-profit; use "organisation" for non-profits, charities, public bodies, and education. - Default to "employer", if ambiguous. - - Refer to the present situation (e.g., "currently", "main"). - - Do not mention SOC or any code numbers. - - Do not ask for company names, client names, or other personal/sensitive data. -- Question structure: - - Start with "What", "How", "Which", or "Where". - - Focus on the employer's main business activities, products, or services—not the respondent's personal tasks. - - One issue per question; no A/B or either/or phrasing; avoid binary questions. - - Limit to one sentence ending with a question mark. - - You may add one additional sentence with broad, non-leading examples covering a wide range of options; - omit examples if they would be leading. -- Respondent considerations: - - Make it easy to answer in a few words. - - Ask only what a typical employee would reasonably know. - - Avoid requiring calculations or percentages. - -Edge cases -- If the shortlist is empty or clearly points to one category, ask a general clarifying question about - the main product/service or value-chain stage to confirm classification. -- Do not output explanations or reasoning; only the formatted result. - -Output format -- Return output that strictly follows: -{format_instructions} -""" -parser_followup_open = PydanticOutputParser(pydantic_object=OpenFollowUp) - -SOC_PROMPT_OPENFOLLOWUP = PromptTemplate.from_template( - template=_core_prompt + _open_follow_up, - partial_variables={ - "format_instructions": parser_followup_open.get_format_instructions(), - }, -) From 7f7730695b8c23151c6d8035309c1a1836bc098d Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 2 Jun 2026 14:28:21 +0000 Subject: [PATCH 09/24] uncomment code in llm.py --- .../llm/llm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index 5361008..de7ad71 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -463,15 +463,15 @@ def prep_call_dict( chain = self.soc_prompt_openfollowup | self.llm - # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier. - # logger.info( - # "LLM request sent - formulate_open_question", - # job_title=truncate_identifier(job_title), - # job_description=truncate_identifier(job_description), - # level_of_education=truncate_identifier(level_of_education), - # industry_descr=truncate_identifier(industry_descr), - # correlation_id=correlation_id or "", - # ) + # Log LLM request sent + logger.info( + "LLM request sent - formulate_open_question", + job_title=truncate_identifier(job_title), + job_description=truncate_identifier(job_description), + level_of_education=truncate_identifier(level_of_education), + industry_descr=truncate_identifier(industry_descr), + correlation_id=correlation_id or "", + ) llm_start = time.perf_counter() try: From 0fbf380b6020a2a205a6ba643efdb19125a06e08 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 2 Jun 2026 15:16:15 +0000 Subject: [PATCH 10/24] correct validation method --- .../models/response_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py index a7624d8..768b56d 100644 --- a/src/occupational_classification_utils/models/response_model.py +++ b/src/occupational_classification_utils/models/response_model.py @@ -17,7 +17,7 @@ MAX_ALT_CANDIDATES: Maximum number of alternative candidates allowed in certain models. """ -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, field_validator, model_validator from occupational_classification_utils.utils.constants import MAX_ALT_CANDIDATES @@ -323,6 +323,8 @@ class UnambiguousResponse(BaseModel): min_length=50, # Ensure detailed reasoning is provided ) + @field_validator("alt_candidates") + @classmethod def validate_alt_candidates(cls, v): """Validates the number of alternative candidates. From 9117aa1da55597aa26c4e2aa7e92e27f7da918cf Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 14:24:58 +0000 Subject: [PATCH 11/24] hash level_of_education --- .../llm/llm.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index de7ad71..bbc8deb 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -122,7 +122,7 @@ async def get_soc_code( self, job_title: str, job_description: str, - level_of_education: str, + # level_of_education: str, manage_others: bool, industry_descr: str, ) -> SocResponse: @@ -148,7 +148,7 @@ async def get_soc_code( { "job_title": job_title, "job_description": job_description, - "level_of_education": level_of_education, + # "level_of_education": level_of_education, "manage_others": manage_others, "industry_descr": industry_descr, }, @@ -391,12 +391,12 @@ async def unambiguous_soc_code( # noqa: PLR0913 return validated_answer, call_dict - async def formulate_open_question( # noqa: PLR0913 + async def formulate_open_question( self, industry_descr: str, job_title: str | None = None, job_description: str | None = None, - level_of_education: str | None = None, + # level_of_education: str | None = None, llm_output: RagCandidate | None = None, correlation_id: str | None = None, ) -> tuple[OpenFollowUp, Any]: @@ -421,7 +421,11 @@ async def formulate_open_question( # noqa: PLR0913 """ def prep_call_dict( - industry_descr, job_title, job_description, level_of_education, llm_output + industry_descr, + job_title, + job_description, + # level_of_education, + llm_output, ): # Helper function to prepare the call dictionary is_job_title_present = job_title is None or job_title in {"", " "} @@ -434,17 +438,17 @@ def prep_call_dict( job_description = ( "Unknown" if is_job_description_present else job_description ) - level_of_education = ( - "Unknown" - if (level_of_education is None or level_of_education in {"", " "}) - else level_of_education - ) + # level_of_education = ( + # "Unknown" + # if (level_of_education is None or level_of_education in {"", " "}) + # else level_of_education + # ) call_dict = { "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, - "level_of_education": level_of_education, + # "level_of_education": level_of_education, "llm_output": str(llm_output), } return call_dict @@ -453,7 +457,7 @@ def prep_call_dict( industry_descr=industry_descr, job_title=job_title, job_description=job_description, - level_of_education=level_of_education, + # level_of_education=level_of_education, llm_output=llm_output, ) @@ -468,7 +472,7 @@ def prep_call_dict( "LLM request sent - formulate_open_question", job_title=truncate_identifier(job_title), job_description=truncate_identifier(job_description), - level_of_education=truncate_identifier(level_of_education), + # level_of_education=truncate_identifier(level_of_education), industry_descr=truncate_identifier(industry_descr), correlation_id=correlation_id or "", ) From 0788eb936cbc61300f0206ac528c97c4b6c1ceef Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 14:44:38 +0000 Subject: [PATCH 12/24] remove level of education from test and prompt --- src/occupational_classification_utils/llm/prompt.py | 1 - tests/test_llm.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index 78be492..ee92fb4 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -50,7 +50,6 @@ ===Respondent Data=== - Job Title: {job_title} - Job Description: {job_description} -- Level of Education: {level_of_education} - Line Management Responsibilities: {manage_others} - Company's main activity: {industry_descr} diff --git a/tests/test_llm.py b/tests/test_llm.py index ba91dd5..5bf56ef 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -246,7 +246,7 @@ async def test_llm_response_mocked_get_soc_code(): result = await ClassificationLLM(model_name=MODEL_NAME).get_soc_code( job_title="teacher", job_description="teach children", - level_of_education="degree", + # level_of_education="degree", manage_others=False, industry_descr="school", ) @@ -457,7 +457,7 @@ async def test_llm_response_mocked_formulate_open_question( industry_descr="", job_title="", job_description="", - level_of_education="", + # level_of_education="", llm_output="", ) assert isinstance(result[0], OpenFollowUp) From 0179a004e17fbdc18dba11ac3a7bdd8a7cc8754d Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 15:29:43 +0000 Subject: [PATCH 13/24] Revert "remove level of education from test and prompt" This reverts commit 0788eb936cbc61300f0206ac528c97c4b6c1ceef. --- src/occupational_classification_utils/llm/prompt.py | 1 + tests/test_llm.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index ee92fb4..78be492 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -50,6 +50,7 @@ ===Respondent Data=== - Job Title: {job_title} - Job Description: {job_description} +- Level of Education: {level_of_education} - Line Management Responsibilities: {manage_others} - Company's main activity: {industry_descr} diff --git a/tests/test_llm.py b/tests/test_llm.py index 5bf56ef..ba91dd5 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -246,7 +246,7 @@ async def test_llm_response_mocked_get_soc_code(): result = await ClassificationLLM(model_name=MODEL_NAME).get_soc_code( job_title="teacher", job_description="teach children", - # level_of_education="degree", + level_of_education="degree", manage_others=False, industry_descr="school", ) @@ -457,7 +457,7 @@ async def test_llm_response_mocked_formulate_open_question( industry_descr="", job_title="", job_description="", - # level_of_education="", + level_of_education="", llm_output="", ) assert isinstance(result[0], OpenFollowUp) From 00988762c67759ad7d5aa83028e2bd834e624d18 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 15:30:29 +0000 Subject: [PATCH 14/24] Revert "hash level_of_education" This reverts commit 9117aa1da55597aa26c4e2aa7e92e27f7da918cf. --- .../llm/llm.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index bbc8deb..de7ad71 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -122,7 +122,7 @@ async def get_soc_code( self, job_title: str, job_description: str, - # level_of_education: str, + level_of_education: str, manage_others: bool, industry_descr: str, ) -> SocResponse: @@ -148,7 +148,7 @@ async def get_soc_code( { "job_title": job_title, "job_description": job_description, - # "level_of_education": level_of_education, + "level_of_education": level_of_education, "manage_others": manage_others, "industry_descr": industry_descr, }, @@ -391,12 +391,12 @@ async def unambiguous_soc_code( # noqa: PLR0913 return validated_answer, call_dict - async def formulate_open_question( + async def formulate_open_question( # noqa: PLR0913 self, industry_descr: str, job_title: str | None = None, job_description: str | None = None, - # level_of_education: str | None = None, + level_of_education: str | None = None, llm_output: RagCandidate | None = None, correlation_id: str | None = None, ) -> tuple[OpenFollowUp, Any]: @@ -421,11 +421,7 @@ async def formulate_open_question( """ def prep_call_dict( - industry_descr, - job_title, - job_description, - # level_of_education, - llm_output, + industry_descr, job_title, job_description, level_of_education, llm_output ): # Helper function to prepare the call dictionary is_job_title_present = job_title is None or job_title in {"", " "} @@ -438,17 +434,17 @@ def prep_call_dict( job_description = ( "Unknown" if is_job_description_present else job_description ) - # level_of_education = ( - # "Unknown" - # if (level_of_education is None or level_of_education in {"", " "}) - # else level_of_education - # ) + level_of_education = ( + "Unknown" + if (level_of_education is None or level_of_education in {"", " "}) + else level_of_education + ) call_dict = { "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, - # "level_of_education": level_of_education, + "level_of_education": level_of_education, "llm_output": str(llm_output), } return call_dict @@ -457,7 +453,7 @@ def prep_call_dict( industry_descr=industry_descr, job_title=job_title, job_description=job_description, - # level_of_education=level_of_education, + level_of_education=level_of_education, llm_output=llm_output, ) @@ -472,7 +468,7 @@ def prep_call_dict( "LLM request sent - formulate_open_question", job_title=truncate_identifier(job_title), job_description=truncate_identifier(job_description), - # level_of_education=truncate_identifier(level_of_education), + level_of_education=truncate_identifier(level_of_education), industry_descr=truncate_identifier(industry_descr), correlation_id=correlation_id or "", ) From cb4be4ef2d149ca3a773543a4b59021a970d7455 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 16:00:01 +0000 Subject: [PATCH 15/24] level of education typehint: optional str --- src/occupational_classification_utils/llm/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index de7ad71..d2bd0cd 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -122,7 +122,7 @@ async def get_soc_code( self, job_title: str, job_description: str, - level_of_education: str, + level_of_education: str | None, manage_others: bool, industry_descr: str, ) -> SocResponse: From 5d24c2ecf9b59f5106a3bec0089b4585bdde0e7c Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Wed, 3 Jun 2026 16:14:38 +0000 Subject: [PATCH 16/24] add level of education field in the followup quesiton --- src/occupational_classification_utils/llm/prompt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index 78be492..01aa70a 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -229,6 +229,7 @@ - Company's main activity: {industry_descr} - Job title: {job_title} - Job description: {job_description} +- Level of Education: {level_of_education} - Shortlist from previous model: {llm_output} - Note: These are candidate occupational categories; do not mention codes or "SOC" to the respondent. From 91ab1f13a74dc1d2eda2c4ad3bdb95e2e493e568 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Thu, 4 Jun 2026 16:54:03 +0000 Subject: [PATCH 17/24] level of education for stages 2 and 3 in the pipeline --- src/occupational_classification_utils/llm/llm.py | 13 +++++++++++-- src/occupational_classification_utils/llm/prompt.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index d2bd0cd..c4390e1 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -269,6 +269,7 @@ async def unambiguous_soc_code( # noqa: PLR0913 semantic_search_results: list[dict], job_title: str | None = None, job_description: str | None = None, + level_of_education: str | None = None, candidates_limit: int = config["llm"]["candidates_limit"], code_digits: int = config["llm"]["code_digits"], correlation_id: str | None = None, @@ -293,6 +294,7 @@ async def unambiguous_soc_code( # noqa: PLR0913 "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, + "level_of_education": level_of_education, "soc_candidates": soc_candidates, } @@ -305,6 +307,7 @@ async def unambiguous_soc_code( # noqa: PLR0913 "LLM request sent - unambiguous_soc_code", job_title=truncate_identifier(job_title), job_description=truncate_identifier(job_description), + level_of_education=truncate_identifier(str(level_of_education)), industry_descr=truncate_identifier(industry_descr), correlation_id=correlation_id or "", ) @@ -468,7 +471,7 @@ def prep_call_dict( "LLM request sent - formulate_open_question", job_title=truncate_identifier(job_title), job_description=truncate_identifier(job_description), - level_of_education=truncate_identifier(level_of_education), + level_of_education=truncate_identifier(str(level_of_education)), industry_descr=truncate_identifier(industry_descr), correlation_id=correlation_id or "", ) @@ -562,6 +565,7 @@ async def sa_rag_soc_code( # noqa: PLR0913 industry_descr: str, job_title: str | None = None, job_description: str | None = None, + level_of_education: str | None = None, code_digits: int = config["llm"]["code_digits"], candidates_limit: int = config["llm"]["candidates_limit"], short_list: list[dict[Any, Any]] | None = None, @@ -575,6 +579,7 @@ async def sa_rag_soc_code( # noqa: PLR0913 industry_descr (str): The description of the industry. job_title (str, optional): The job title. Defaults to None. job_description (str, optional): The job description. Defaults to None. + level_of_education (str): The level of education required for the job. code_digits (int, optional): The number of digits in the generated SOC code. Defaults to 4. candidates_limit (int, optional): The maximum number of SOC code candidates @@ -592,7 +597,9 @@ async def sa_rag_soc_code( # noqa: PLR0913 """ - def prep_call_dict(industry_descr, job_title, job_description, soc_codes): + def prep_call_dict( + industry_descr, job_title, job_description, level_of_education, soc_codes + ): # Helper function to prepare the call dictionary is_job_title_present = job_title is None or job_title in {"", " "} job_title = "Unknown" if is_job_title_present else job_title @@ -609,6 +616,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): "industry_descr": industry_descr, "job_title": job_title, "job_description": job_description, + "level_of_education": level_of_education, "soc_index": soc_codes, } return call_dict @@ -626,6 +634,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes): industry_descr=industry_descr, job_title=job_title, job_description=job_description, + level_of_education=level_of_education, soc_codes=soc_codes, ) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index 01aa70a..a289e0c 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -197,6 +197,7 @@ - Company's main activity: {industry_descr} - Job Title: {job_title} - Job Description: {job_description} +- Level of Education: {level_of_education} ===Shortlist=== {soc_candidates} From 4fb77660d5bbe80a11beb5f4fa75329b29d50e04 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Thu, 4 Jun 2026 17:06:31 +0000 Subject: [PATCH 18/24] allow zero alternative candidates --- .../models/response_model.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py index 768b56d..177813e 100644 --- a/src/occupational_classification_utils/models/response_model.py +++ b/src/occupational_classification_utils/models/response_model.py @@ -314,7 +314,6 @@ class UnambiguousResponse(BaseModel): default_factory=list, description="Short list of possible classification codes with their " "descriptive labels and estimated likelihoods.", - min_length=1, # Ensure there's always at least one candidate max_length=10, # Limit to less than 10 candidates ) @@ -328,7 +327,7 @@ class UnambiguousResponse(BaseModel): def validate_alt_candidates(cls, v): """Validates the number of alternative candidates. - Ensures that the number of candidates is between 1 and the maximum allowed. + Ensures that the number of candidates is less or equal to the maximum allowed. Args: v (list): The list of alternative candidates. @@ -339,8 +338,8 @@ def validate_alt_candidates(cls, v): Raises: ValueError: If the number of candidates is not within the allowed range. """ - if not 1 <= len(v) <= MAX_ALT_CANDIDATES: - raise ValueError("alt_candidates must contain between 1 and 10 items.") + if not len(v) <= MAX_ALT_CANDIDATES: + raise ValueError("alt_candidates must contain no more than 10 items.") return v From 88337c01e478613c672ca5f4d56f475ab198c0ce Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Mon, 15 Jun 2026 09:59:17 +0000 Subject: [PATCH 19/24] add level of education to the prompt --- src/occupational_classification_utils/llm/prompt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index a289e0c..3955d04 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -104,6 +104,7 @@ - Company's main activity: {industry_descr} - Job Title: {job_title} - Job Description: {job_description} +- Level of Education: {level_of_education} ===Relevant subset of UK SOC 2020=== {soc_index} From 07c23f14d0c7dc40e402c2b272fb52b20ec7d7d5 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Mon, 15 Jun 2026 15:10:24 +0000 Subject: [PATCH 20/24] create soc lookup --- notebooks/create_soc_lookup_2026_04.py | 673 +++++++++++++++++++++++++ 1 file changed, 673 insertions(+) create mode 100644 notebooks/create_soc_lookup_2026_04.py diff --git a/notebooks/create_soc_lookup_2026_04.py b/notebooks/create_soc_lookup_2026_04.py new file mode 100644 index 0000000..2b246f3 --- /dev/null +++ b/notebooks/create_soc_lookup_2026_04.py @@ -0,0 +1,673 @@ +# %% +# pylint: disable=C0103, C0114, C0301, R0801, W0105 + +"""Noetbook attempting to create a SOC DIRECT LOOKUP. + +Diasbling duplicate code - methods needs to be changed in other repos to reflect the change in data. +Diasbling line-too-long: commentary and discussion. +Disabling pointless-string-statement: comments to the code for reading clarity. +""" + +# %% +import ast +import re + +import dotenv +import pandas as pd + +# %% +from occupational_classification.data_access.soc_data_access import ( + _combine_soc_index_job_title as combine_job_title, +) + +# %% +input_folder = "soc_data" + +file_name = "ashe_llm_soc_codes" + +file_suffix = "_2026_05_19" + +# %% +# read the data +data = pd.read_csv(f"notebooks/{input_folder}/{file_name}{file_suffix}.csv") + +# %% +# use only columns needed +data = data[ + [ + "documents", + # "corrected_spelling", + "label", + "codable", + "llm_soc_code", + "llm_soc_candidates", + "reasoning", + ] +] + + +# %% +def parse_string(text): + """Convert string to a list of dictionaries for SOC candidates.""" + if isinstance(text, str): + processed = text.replace("SocCandidate(", "dict(") + processed = re.sub(r"(\w+)=", r'"\1":', processed) + processed = processed.replace("dict(", "{").replace(")", "}") + return ast.literal_eval(processed) + return [] + + +# %% +# string to list of dictionaries +data["llm_soc_candidates"] = data["llm_soc_candidates"].map(parse_string) + +# %% +print(f"llm {data['codable'].value_counts()}") + + +# %% +def access_soc_code_from_candidate_list(row_values: list[dict]) -> list[str]: + """From list of potential SOC candidates, access SOC codes. + + Args: + row_values (list[dict]): list of dictionaries with SOC candidates. + + Return: + candidates (list[str]): list of 4-digit candidate codes. + """ + if isinstance(row_values, list): + candidates = [] + for row in row_values: + if len(row) < 1: + return None + candidates.append(row.get("soc_code")) + else: + return None + return candidates + + +# %% +def float_to_list_of_codes(row_values: float) -> str: + """Convert float to a string of codes (str). + + Args: + row_values (float): SOC code as a float. + + Return: + row_values (str): SOC code as a string. + """ + if isinstance(row_values, float): + codes_list = [f"{row_values:.0f}"] + print(type(codes_list)) + return codes_list + return [row_values] + + +# %% +data["label"] = data["label"].astype(str) + +# %% +msk = data["llm_soc_code"].isna() # take rows, where LLM didn't provide a code. + +# %% +data.loc[~msk, "llm_soc_code"] = data.loc[~msk, "llm_soc_code"].apply( + float_to_list_of_codes +) + +# %% +data.loc[msk, "llm_soc_code"] = data.loc[msk, "llm_soc_candidates"].apply( + access_soc_code_from_candidate_list +) + + +# %% +def check_agreement(df: pd.DataFrame, df_source: str): + """Checks agreement between ASHE and LLM assigned codes. + + Args: + df (pd.DataFrame): dataframe containing columns 'label' and 'llm_soc_code' with codes. + df_source (str): String indicaitng the source of the dataframe (ASHE or soc index). + """ + agr, in_cand = 0, 0 + # check if 'label' is the same as 'llm_soc_code'. + # If LLM uncodable, check if 'label' in candidates. + for row in range(len(df)): + if len(df.iloc[row]["llm_soc_code"]) == 1: + agr += df.iloc[row]["label"] == df.iloc[row]["llm_soc_code"][0] + df.loc[row, "codable"] = True + elif len(df.iloc[row]["llm_soc_code"]) > 1: + in_cand += df.iloc[row]["label"] in df.iloc[row]["llm_soc_code"] + + print( + f"Agreement full {df_source}: {agr} ({round(agr / len(df), 2) * 100}% of all rows)" + ) + print( + f"Agreement (code in candidates) {df_source}: {in_cand} ({round(in_cand / len(df), 2) * 100}% of all rows)" # pylint: disable=C0301 + ) + print( + f"Agreement (label the same or within candidates) {df_source}: {agr + in_cand} ({round((agr + in_cand) / len(df), 2) * 100}% of all rows)" # pylint: disable=C0301 + ) + + +# %% +check_agreement(data, "ASHE and LLM") + +# %% +len(data) + + +# %% +def check_code_count(df: pd.DataFrame, df_source: str): + """Check if the LLM assigned a sigle, multiple, or none codes when assessing SOC codes. + + Args: + df (pd.DataFrame): dataframe containing LLM assessment of SOC codes. + Requires 'llm_soc_code' column. + df_source (str): String indicaitng the source of the dataframe (ASHE or soc index). + """ + longer, shorter, one_code = 0, 0, 0 + for code in df["llm_soc_code"]: + if isinstance(code, list): + if len(code) > 1: + longer += 1 + if len(code) < 1: + shorter += 1 + if len(code) == 1: + one_code += 1 + else: + one_code += 1 + + print( + f"More than one code {df_source}: {longer} ({round(longer / len(df) * 100, 2)}%)" + ) + print( + f"No codes assigned {df_source}: {shorter} ({round(shorter / len(df) * 100, 2)}%)" + ) + print( + f"One code assigned {df_source}: {one_code} ({round(one_code / len(df) * 100, 2)}%)" + ) + + +# %% +check_code_count(data, "ASHE") + +# %% +full_data_codable = data[data["codable"]] + +# %% +data_only_columns = data[["documents", "llm_soc_code"]] + +# %% +data_one_code = data_only_columns[data_only_columns["llm_soc_code"].str.len() == 1] + +# %% +e = data_one_code["llm_soc_code"].str[0] + +# %% +numeric = pd.to_numeric(e, errors="coerce") + +# %% +data_one_code["llm_soc_code"] = numeric + +# %% +data_one_code = data_one_code.dropna(subset=["llm_soc_code"]) + +# %% +data_one_code["llm_soc_code"] = data_one_code["llm_soc_code"].astype(int) + +# %% +data_one_code = data_one_code.rename( + columns={"corrected_spelling": "documents", "llm_soc_code": "label"} +) + +# %% +data_one_code = data_one_code.drop_duplicates( + subset=["documents", "label"], keep="last", ignore_index=True +) + + +# %% +def load_soc_framework(filepath: str) -> pd.DataFrame: + """Load SOC structure. + + Provides structure with all levels and names of the SOC 2020. + + Args: + filepath (str): A path to the file containing SOC Structure. + + Returns: + pd.DataFrame: A DataFrame containing group code, group title, + group description, typical entry routes and associated qualifications, + and list of tasks. + """ + soc_df = pd.read_excel( + filepath, + sheet_name="SOC2020 framework", + usecols=[ + "SOC2020 Unit Group", + "SOC2020 Group Title", + ], + dtype=str, + ) + soc_df.columns = [ + col.lower().replace(" ", "_").replace("__", "_").replace("\n", "") + for col in soc_df.columns + ] + soc_df = soc_df.rename( + columns={"soc2020_unit_group": "code", "soc2020_group_title": "title"} + ) + + for col in soc_df.columns: + soc_df[col] = soc_df[col].str.strip() + + return soc_df + + +# %% +knowledge_bucket = dotenv.get_key(".env", "KNOWLEDGE_BUCKET") + +# %% +s_list = load_soc_framework( + f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx" +) +s_list = s_list[s_list["code"].notna()] + +# %% +codes_from_framework_str = list(s_list["code"].value_counts().keys()) + +# %% +codes_from_framework_int = [] +for k in codes_from_framework_str: + codes_from_framework_int.append(int(k)) + +# %% +phantom_codes = ( + data_one_code[~data_one_code["label"].isin(codes_from_framework_int)]["label"] + .value_counts() + .keys() +) + +# %% +print("codes that don't appear in the SOC codes list\n", phantom_codes) + +# %% +data_one_code_no_phantoms = data_one_code[ + data_one_code["label"].isin(codes_from_framework_int) +] + +# %% +coded_all = len(data_one_code) + +# %% +coded_no_phantom = len(data_one_code_no_phantoms) + +# %% +diff = len(data_one_code) - len(data_one_code_no_phantoms) + +# %% +drop = diff / coded_all * 100 + +# %% +print( + f"with phantoms: {coded_all}\ncoded no phantoms: {coded_no_phantom}\ndiff: {diff}\ndrop(%): {drop:.2f}" # pylint: disable=C0301 +) + +# %% +print(data_one_code_no_phantoms) + +# %% +print( + "check if there is any duplicates\n", + data_one_code_no_phantoms[ + data_one_code_no_phantoms.duplicated(subset=["documents"]) + ], +) + +# %% +data_one_code_no_phantoms = data_one_code_no_phantoms.drop_duplicates( + subset=["documents"], keep="last", ignore_index=True +) + +# %% +""" data_one_code_no_phantoms contains codes assigned by the LLM. Some of the codes were not present in the SOC codes list, and have been removed. +Those codes not neccessairly agree with codes initially assigned in ASHE dataset. +""" + +# %% +# data_one_code_no_phantoms.to_csv("soc_data/SOC_DIRECT_LOOKUP.csv") + +# %% +# data_one_code_no_phantoms.to_csv(f"{knowledge_bucket}SOC_DIRECT_LOOKUP.csv") + +# %% [markdown] +# # AGREEMENT + +# %% +"""Select a subset of codes, where LLM and ASHE assign the same code for a given job title. +""" + +# %% +msk_codable = data["codable"] + +# %% +data_codable = data[msk_codable] + +# %% +len(data_codable[data_codable["llm_soc_code"].str.len() > 1]) + +# %% +len(data_codable[(data_codable["llm_soc_code"].str.len() == 1)]) + +# %% +len( + data_codable[data_codable["llm_soc_code"].str.len() < 1] +) # expect 0 - if is codable, there should be a code available + +# %% +print(data_codable[data_codable["llm_soc_code"].str.len() == 1]) + +# %% +one_code_subset = data_codable[data_codable["llm_soc_code"].str.len() == 1] + +# %% +codes_with_agreement = one_code_subset[ + one_code_subset.apply(lambda r: str(r["label"]) in str(r["llm_soc_code"]), axis=1) +].reset_index(drop=True) + +# %% +soc_lookup = codes_with_agreement[["documents", "label"]] + +# %% +# save this once all is finished + +# %% [markdown] +# # One code from LLM - why disagreement? + +# %% +full_data_one_code = data[data["llm_soc_code"].str.len() == 1] + +# %% +one_code_disagreement = full_data_one_code[ + full_data_one_code.apply( + lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1 + ) +].reset_index(drop=True) + +# %% +print(one_code_disagreement) + +# %% +"""Look at the cases, where: +- LLM claims is codable ('codable' == True) +- ASHE does not agree with LLM ('label' != 'llm_soc_code') +- ASHE is one of the candidates selected by LLM ('label' in 'llm_soc_candidates') +""" + + +# %% +def get_candidates_list(row: pd.Series) -> list: + """Get a list of candidates determined by LLM. + + Args: + row: pd.Series: row with LLM output + + Returns: + list: lsit of candidates. + """ + candidates = [] + for i in row["llm_soc_candidates"]: + candidates.append(i["soc_code"]) + return candidates + + +# %% +ashe_llm_disagreement_multi_candidate = one_code_disagreement[ + one_code_disagreement["llm_soc_candidates"].str.len() > 1 +].reset_index(drop=True) + +# %% +ashe_llm_disagreement_multi_candidate.loc[:, "candidate_list"] = ( + ashe_llm_disagreement_multi_candidate.apply(get_candidates_list, axis=1) +) + +# %% +ashe_in_canidates = ashe_llm_disagreement_multi_candidate[ + ashe_llm_disagreement_multi_candidate.apply( + lambda r: str(r["label"]) in str(r["candidate_list"]), axis=1 + ) +] + +# %% +print( + f"""There is {len(ashe_in_canidates)} rows, where code determined by ASHE appears in the cadnidates from LLM, when LLM assessed the job title is codable.""" +) + +# %% [markdown] +# # How many of the rows that are in SOC INDEX have agreement/don't have agreement with ASHE + +# %% +# Access SOC_INDEX data +soc_coding_index_file = ( + f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx" +) + + +# %% +def load_soc_index(filepath: str) -> pd.DataFrame: + """Load SOC index. + Provides a list of over 32,000 titles associated with employment. + + Args: + filepath (str): A path to the file containing SOC Index. + + Returns: + pd.DataFrame: A DataFrame with transformed job titles. + """ + soc_index_df = pd.read_excel( + filepath, + sheet_name="SOC2020 coding index", + usecols=["SOC_2020", "INDEXOCC-natural_word_order", "ADD", "IND"], + dtype=str, + ) + + soc_index_df.columns = [col.lower() for col in soc_index_df.columns] + + soc_index_df = soc_index_df.rename( + columns={"indexocc-natural_word_order": "indexocc", "soc_2020": "code"} + ) + + soc_index_df = soc_index_df[soc_index_df["code"] != "}}}}"] + soc_index_df = soc_index_df.dropna(subset=["code", "indexocc"]) + soc_index_df["title"] = soc_index_df.apply(combine_job_title, axis=1) + soc_index_df = soc_index_df[["code", "title"]] + soc_index_df["title"] = soc_index_df["title"].str.capitalize() + + return soc_index_df + + +# %% +soc_list = load_soc_index(soc_coding_index_file) + +# %% +soc_list["title"] = soc_list["title"].str.upper() + +# %% +titles_list = soc_list["title"] +titles_list = titles_list.to_list() + +# %% +# get subset of the ASHE data that comes from soc_index +in_list = data[data["documents"].isin(titles_list)].reset_index(drop=True) + +# %% +in_list_codable = in_list[in_list["codable"]] + +# %% +in_list_codable_disagreement = in_list_codable[ + in_list_codable.apply( + lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1 + ) +].reset_index(drop=True) +in_list_codable_agreement = in_list_codable[ + in_list_codable.apply( + lambda r: str(r["label"]) in str(r["llm_soc_candidates"]), axis=1 + ) +].reset_index(drop=True) + +# %% +in_list_codable_disagreement_one_code = in_list_codable_disagreement[ + in_list_codable_disagreement["llm_soc_code"].str.len() == 1 +] +in_list_codable_agreement_one_code = in_list_codable_agreement[ + in_list_codable_agreement["llm_soc_code"].str.len() == 1 +] + +# %% +soc_lookup = ( + pd.concat([soc_lookup, in_list_codable_agreement_one_code[["documents", "label"]]]) + .drop_duplicates(subset=["documents", "label"]) + .reset_index(drop=True) +) + +# %% [markdown] +# # LLM candidates - high likelihood (0.9/0.7) + +# %% +data_multiple_candidates = data[data["llm_soc_candidates"].str.len() > 1].reset_index( + drop=True +) + + +# %% +def get_high_candidate(row: pd.Series) -> str: + """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM), + where only one candidate got that score. + + Args: + row: pd.Series: row with LLM output + + Returns: + str: most likely candidate. + """ + high_likelihood = [] + for i in row["llm_soc_candidates"]: + if i["likelihood"] >= 0.9: # noqa: PLR2004 + high_likelihood.append(i) + if len(high_likelihood) != 1: + return None + return high_likelihood[0]["soc_code"] + + +# %% +def get_high_candidate_with_low_other(row: pd.Series) -> str: + """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM), + where only one candidate got that score, and no other candidates got likelihood score above 0.7. + + Args: + row: pd.Series: row with LLM output + + Returns: + str: most likely candidate. + """ + high_likelihood, lower_likelihood = [], [] + + for i in row["llm_soc_candidates"]: + if i["likelihood"] >= 0.9: # noqa: PLR2004 + high_likelihood.append(i) + elif i["likelihood"] >= 0.7: # noqa: PLR2004 + lower_likelihood.append(i) + + if len(high_likelihood) != 1 or len(lower_likelihood) > 0: + return None + return high_likelihood[0]["soc_code"] + + +# %% +data_multiple_candidates.loc[:, "most_likely_candidate"] = ( + data_multiple_candidates.apply(get_high_candidate, axis=1) +) + +# %% +print(len(data_multiple_candidates)) + +# %% +data_high_likelihood = data_multiple_candidates[ + data_multiple_candidates["most_likely_candidate"].notna() +] + +# %% +data_high_likelihood_agreement = data_high_likelihood[ + data_high_likelihood.apply( + lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1 + ) +].reset_index(drop=True) + +# %% +print(data_high_likelihood_agreement.iloc[0]) + +# %% +misspelled = 0 +for k in data_high_likelihood_agreement["reasoning"]: + # print(k) + if "misspelling" in k or "misspelled" in k: + misspelled += 1 + +# %% +print(misspelled) + +# %% +print( + f"""We looked at the rows, where the LLM decided thre is more than one possible SOC code candidate {len(data_multiple_candidates)} codes ({round((len(data_multiple_candidates) / len(data)) * 100, 2)}% of all codes). +To that subset of data, we added a new column 'most_likely_candidate'. +It was populated with codes, that were assessed to have a high (0.9) likelihood. +If the LLM assigned more than one code with high likelihood, those were disregarded, +as there is no way to determine which code is more likely, according to the LLM, meaning it is not unambiguous. + +Only one candidate with 0.9 likelihood was assigned for {len(data_high_likelihood)} rows. + +Next, we compared the agreement between the label assigned in the original data with the "most_likely_candidate", +which resulted in {len(data_high_likelihood_agreement)} cases. +""" +) + +# %% +data_high = data[data["llm_soc_candidates"].str.len() > 1].reset_index(drop=True) + +# %% +data_high.loc[:, "most_likely_candidate"] = data_high.apply( + get_high_candidate_with_low_other, axis=1 +) + +# %% +data_high_notna = data_high[data_high["most_likely_candidate"].notna()] + +# %% +data_high_notna_agreement = data_high_notna[ + data_high_notna.apply( + lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1 + ) +].reset_index(drop=True) + +# %% +soc_lookup = ( + pd.concat([soc_lookup, data_high_notna_agreement[["documents", "label"]]]) + .drop_duplicates(subset=["documents", "label"]) + .reset_index(drop=True) +) + +# %% +# soc_lookup.to_csv(f"{knowledge_bucket}wip_data/SOC_DIRECT_LOOKUP.csv") + +# %% +# data_one_code_no_phantoms +len(soc_lookup) + +# %% +soc_copy = ( + pd.concat([soc_lookup, data_one_code_no_phantoms[["documents", "label"]]]) + .drop_duplicates(subset=["documents"]) + .reset_index(drop=True) +) + +# %% +len(soc_copy) From cf5cb1fbd508dd73162fd8494a7cbc5cdbec0c98 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 16 Jun 2026 08:26:28 +0000 Subject: [PATCH 21/24] Revert "add level of education to the prompt" This reverts commit 88337c01e478613c672ca5f4d56f475ab198c0ce. --- src/occupational_classification_utils/llm/prompt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index 3955d04..a289e0c 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -104,7 +104,6 @@ - Company's main activity: {industry_descr} - Job Title: {job_title} - Job Description: {job_description} -- Level of Education: {level_of_education} ===Relevant subset of UK SOC 2020=== {soc_index} From 29c57121ff8bcac5ee7244746d0189cb246c7706 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 16 Jun 2026 08:29:23 +0000 Subject: [PATCH 22/24] Revert "create soc lookup" This reverts commit 07c23f14d0c7dc40e402c2b272fb52b20ec7d7d5. --- notebooks/create_soc_lookup_2026_04.py | 673 ------------------------- 1 file changed, 673 deletions(-) delete mode 100644 notebooks/create_soc_lookup_2026_04.py diff --git a/notebooks/create_soc_lookup_2026_04.py b/notebooks/create_soc_lookup_2026_04.py deleted file mode 100644 index 2b246f3..0000000 --- a/notebooks/create_soc_lookup_2026_04.py +++ /dev/null @@ -1,673 +0,0 @@ -# %% -# pylint: disable=C0103, C0114, C0301, R0801, W0105 - -"""Noetbook attempting to create a SOC DIRECT LOOKUP. - -Diasbling duplicate code - methods needs to be changed in other repos to reflect the change in data. -Diasbling line-too-long: commentary and discussion. -Disabling pointless-string-statement: comments to the code for reading clarity. -""" - -# %% -import ast -import re - -import dotenv -import pandas as pd - -# %% -from occupational_classification.data_access.soc_data_access import ( - _combine_soc_index_job_title as combine_job_title, -) - -# %% -input_folder = "soc_data" - -file_name = "ashe_llm_soc_codes" - -file_suffix = "_2026_05_19" - -# %% -# read the data -data = pd.read_csv(f"notebooks/{input_folder}/{file_name}{file_suffix}.csv") - -# %% -# use only columns needed -data = data[ - [ - "documents", - # "corrected_spelling", - "label", - "codable", - "llm_soc_code", - "llm_soc_candidates", - "reasoning", - ] -] - - -# %% -def parse_string(text): - """Convert string to a list of dictionaries for SOC candidates.""" - if isinstance(text, str): - processed = text.replace("SocCandidate(", "dict(") - processed = re.sub(r"(\w+)=", r'"\1":', processed) - processed = processed.replace("dict(", "{").replace(")", "}") - return ast.literal_eval(processed) - return [] - - -# %% -# string to list of dictionaries -data["llm_soc_candidates"] = data["llm_soc_candidates"].map(parse_string) - -# %% -print(f"llm {data['codable'].value_counts()}") - - -# %% -def access_soc_code_from_candidate_list(row_values: list[dict]) -> list[str]: - """From list of potential SOC candidates, access SOC codes. - - Args: - row_values (list[dict]): list of dictionaries with SOC candidates. - - Return: - candidates (list[str]): list of 4-digit candidate codes. - """ - if isinstance(row_values, list): - candidates = [] - for row in row_values: - if len(row) < 1: - return None - candidates.append(row.get("soc_code")) - else: - return None - return candidates - - -# %% -def float_to_list_of_codes(row_values: float) -> str: - """Convert float to a string of codes (str). - - Args: - row_values (float): SOC code as a float. - - Return: - row_values (str): SOC code as a string. - """ - if isinstance(row_values, float): - codes_list = [f"{row_values:.0f}"] - print(type(codes_list)) - return codes_list - return [row_values] - - -# %% -data["label"] = data["label"].astype(str) - -# %% -msk = data["llm_soc_code"].isna() # take rows, where LLM didn't provide a code. - -# %% -data.loc[~msk, "llm_soc_code"] = data.loc[~msk, "llm_soc_code"].apply( - float_to_list_of_codes -) - -# %% -data.loc[msk, "llm_soc_code"] = data.loc[msk, "llm_soc_candidates"].apply( - access_soc_code_from_candidate_list -) - - -# %% -def check_agreement(df: pd.DataFrame, df_source: str): - """Checks agreement between ASHE and LLM assigned codes. - - Args: - df (pd.DataFrame): dataframe containing columns 'label' and 'llm_soc_code' with codes. - df_source (str): String indicaitng the source of the dataframe (ASHE or soc index). - """ - agr, in_cand = 0, 0 - # check if 'label' is the same as 'llm_soc_code'. - # If LLM uncodable, check if 'label' in candidates. - for row in range(len(df)): - if len(df.iloc[row]["llm_soc_code"]) == 1: - agr += df.iloc[row]["label"] == df.iloc[row]["llm_soc_code"][0] - df.loc[row, "codable"] = True - elif len(df.iloc[row]["llm_soc_code"]) > 1: - in_cand += df.iloc[row]["label"] in df.iloc[row]["llm_soc_code"] - - print( - f"Agreement full {df_source}: {agr} ({round(agr / len(df), 2) * 100}% of all rows)" - ) - print( - f"Agreement (code in candidates) {df_source}: {in_cand} ({round(in_cand / len(df), 2) * 100}% of all rows)" # pylint: disable=C0301 - ) - print( - f"Agreement (label the same or within candidates) {df_source}: {agr + in_cand} ({round((agr + in_cand) / len(df), 2) * 100}% of all rows)" # pylint: disable=C0301 - ) - - -# %% -check_agreement(data, "ASHE and LLM") - -# %% -len(data) - - -# %% -def check_code_count(df: pd.DataFrame, df_source: str): - """Check if the LLM assigned a sigle, multiple, or none codes when assessing SOC codes. - - Args: - df (pd.DataFrame): dataframe containing LLM assessment of SOC codes. - Requires 'llm_soc_code' column. - df_source (str): String indicaitng the source of the dataframe (ASHE or soc index). - """ - longer, shorter, one_code = 0, 0, 0 - for code in df["llm_soc_code"]: - if isinstance(code, list): - if len(code) > 1: - longer += 1 - if len(code) < 1: - shorter += 1 - if len(code) == 1: - one_code += 1 - else: - one_code += 1 - - print( - f"More than one code {df_source}: {longer} ({round(longer / len(df) * 100, 2)}%)" - ) - print( - f"No codes assigned {df_source}: {shorter} ({round(shorter / len(df) * 100, 2)}%)" - ) - print( - f"One code assigned {df_source}: {one_code} ({round(one_code / len(df) * 100, 2)}%)" - ) - - -# %% -check_code_count(data, "ASHE") - -# %% -full_data_codable = data[data["codable"]] - -# %% -data_only_columns = data[["documents", "llm_soc_code"]] - -# %% -data_one_code = data_only_columns[data_only_columns["llm_soc_code"].str.len() == 1] - -# %% -e = data_one_code["llm_soc_code"].str[0] - -# %% -numeric = pd.to_numeric(e, errors="coerce") - -# %% -data_one_code["llm_soc_code"] = numeric - -# %% -data_one_code = data_one_code.dropna(subset=["llm_soc_code"]) - -# %% -data_one_code["llm_soc_code"] = data_one_code["llm_soc_code"].astype(int) - -# %% -data_one_code = data_one_code.rename( - columns={"corrected_spelling": "documents", "llm_soc_code": "label"} -) - -# %% -data_one_code = data_one_code.drop_duplicates( - subset=["documents", "label"], keep="last", ignore_index=True -) - - -# %% -def load_soc_framework(filepath: str) -> pd.DataFrame: - """Load SOC structure. - - Provides structure with all levels and names of the SOC 2020. - - Args: - filepath (str): A path to the file containing SOC Structure. - - Returns: - pd.DataFrame: A DataFrame containing group code, group title, - group description, typical entry routes and associated qualifications, - and list of tasks. - """ - soc_df = pd.read_excel( - filepath, - sheet_name="SOC2020 framework", - usecols=[ - "SOC2020 Unit Group", - "SOC2020 Group Title", - ], - dtype=str, - ) - soc_df.columns = [ - col.lower().replace(" ", "_").replace("__", "_").replace("\n", "") - for col in soc_df.columns - ] - soc_df = soc_df.rename( - columns={"soc2020_unit_group": "code", "soc2020_group_title": "title"} - ) - - for col in soc_df.columns: - soc_df[col] = soc_df[col].str.strip() - - return soc_df - - -# %% -knowledge_bucket = dotenv.get_key(".env", "KNOWLEDGE_BUCKET") - -# %% -s_list = load_soc_framework( - f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx" -) -s_list = s_list[s_list["code"].notna()] - -# %% -codes_from_framework_str = list(s_list["code"].value_counts().keys()) - -# %% -codes_from_framework_int = [] -for k in codes_from_framework_str: - codes_from_framework_int.append(int(k)) - -# %% -phantom_codes = ( - data_one_code[~data_one_code["label"].isin(codes_from_framework_int)]["label"] - .value_counts() - .keys() -) - -# %% -print("codes that don't appear in the SOC codes list\n", phantom_codes) - -# %% -data_one_code_no_phantoms = data_one_code[ - data_one_code["label"].isin(codes_from_framework_int) -] - -# %% -coded_all = len(data_one_code) - -# %% -coded_no_phantom = len(data_one_code_no_phantoms) - -# %% -diff = len(data_one_code) - len(data_one_code_no_phantoms) - -# %% -drop = diff / coded_all * 100 - -# %% -print( - f"with phantoms: {coded_all}\ncoded no phantoms: {coded_no_phantom}\ndiff: {diff}\ndrop(%): {drop:.2f}" # pylint: disable=C0301 -) - -# %% -print(data_one_code_no_phantoms) - -# %% -print( - "check if there is any duplicates\n", - data_one_code_no_phantoms[ - data_one_code_no_phantoms.duplicated(subset=["documents"]) - ], -) - -# %% -data_one_code_no_phantoms = data_one_code_no_phantoms.drop_duplicates( - subset=["documents"], keep="last", ignore_index=True -) - -# %% -""" data_one_code_no_phantoms contains codes assigned by the LLM. Some of the codes were not present in the SOC codes list, and have been removed. -Those codes not neccessairly agree with codes initially assigned in ASHE dataset. -""" - -# %% -# data_one_code_no_phantoms.to_csv("soc_data/SOC_DIRECT_LOOKUP.csv") - -# %% -# data_one_code_no_phantoms.to_csv(f"{knowledge_bucket}SOC_DIRECT_LOOKUP.csv") - -# %% [markdown] -# # AGREEMENT - -# %% -"""Select a subset of codes, where LLM and ASHE assign the same code for a given job title. -""" - -# %% -msk_codable = data["codable"] - -# %% -data_codable = data[msk_codable] - -# %% -len(data_codable[data_codable["llm_soc_code"].str.len() > 1]) - -# %% -len(data_codable[(data_codable["llm_soc_code"].str.len() == 1)]) - -# %% -len( - data_codable[data_codable["llm_soc_code"].str.len() < 1] -) # expect 0 - if is codable, there should be a code available - -# %% -print(data_codable[data_codable["llm_soc_code"].str.len() == 1]) - -# %% -one_code_subset = data_codable[data_codable["llm_soc_code"].str.len() == 1] - -# %% -codes_with_agreement = one_code_subset[ - one_code_subset.apply(lambda r: str(r["label"]) in str(r["llm_soc_code"]), axis=1) -].reset_index(drop=True) - -# %% -soc_lookup = codes_with_agreement[["documents", "label"]] - -# %% -# save this once all is finished - -# %% [markdown] -# # One code from LLM - why disagreement? - -# %% -full_data_one_code = data[data["llm_soc_code"].str.len() == 1] - -# %% -one_code_disagreement = full_data_one_code[ - full_data_one_code.apply( - lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1 - ) -].reset_index(drop=True) - -# %% -print(one_code_disagreement) - -# %% -"""Look at the cases, where: -- LLM claims is codable ('codable' == True) -- ASHE does not agree with LLM ('label' != 'llm_soc_code') -- ASHE is one of the candidates selected by LLM ('label' in 'llm_soc_candidates') -""" - - -# %% -def get_candidates_list(row: pd.Series) -> list: - """Get a list of candidates determined by LLM. - - Args: - row: pd.Series: row with LLM output - - Returns: - list: lsit of candidates. - """ - candidates = [] - for i in row["llm_soc_candidates"]: - candidates.append(i["soc_code"]) - return candidates - - -# %% -ashe_llm_disagreement_multi_candidate = one_code_disagreement[ - one_code_disagreement["llm_soc_candidates"].str.len() > 1 -].reset_index(drop=True) - -# %% -ashe_llm_disagreement_multi_candidate.loc[:, "candidate_list"] = ( - ashe_llm_disagreement_multi_candidate.apply(get_candidates_list, axis=1) -) - -# %% -ashe_in_canidates = ashe_llm_disagreement_multi_candidate[ - ashe_llm_disagreement_multi_candidate.apply( - lambda r: str(r["label"]) in str(r["candidate_list"]), axis=1 - ) -] - -# %% -print( - f"""There is {len(ashe_in_canidates)} rows, where code determined by ASHE appears in the cadnidates from LLM, when LLM assessed the job title is codable.""" -) - -# %% [markdown] -# # How many of the rows that are in SOC INDEX have agreement/don't have agreement with ASHE - -# %% -# Access SOC_INDEX data -soc_coding_index_file = ( - f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx" -) - - -# %% -def load_soc_index(filepath: str) -> pd.DataFrame: - """Load SOC index. - Provides a list of over 32,000 titles associated with employment. - - Args: - filepath (str): A path to the file containing SOC Index. - - Returns: - pd.DataFrame: A DataFrame with transformed job titles. - """ - soc_index_df = pd.read_excel( - filepath, - sheet_name="SOC2020 coding index", - usecols=["SOC_2020", "INDEXOCC-natural_word_order", "ADD", "IND"], - dtype=str, - ) - - soc_index_df.columns = [col.lower() for col in soc_index_df.columns] - - soc_index_df = soc_index_df.rename( - columns={"indexocc-natural_word_order": "indexocc", "soc_2020": "code"} - ) - - soc_index_df = soc_index_df[soc_index_df["code"] != "}}}}"] - soc_index_df = soc_index_df.dropna(subset=["code", "indexocc"]) - soc_index_df["title"] = soc_index_df.apply(combine_job_title, axis=1) - soc_index_df = soc_index_df[["code", "title"]] - soc_index_df["title"] = soc_index_df["title"].str.capitalize() - - return soc_index_df - - -# %% -soc_list = load_soc_index(soc_coding_index_file) - -# %% -soc_list["title"] = soc_list["title"].str.upper() - -# %% -titles_list = soc_list["title"] -titles_list = titles_list.to_list() - -# %% -# get subset of the ASHE data that comes from soc_index -in_list = data[data["documents"].isin(titles_list)].reset_index(drop=True) - -# %% -in_list_codable = in_list[in_list["codable"]] - -# %% -in_list_codable_disagreement = in_list_codable[ - in_list_codable.apply( - lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1 - ) -].reset_index(drop=True) -in_list_codable_agreement = in_list_codable[ - in_list_codable.apply( - lambda r: str(r["label"]) in str(r["llm_soc_candidates"]), axis=1 - ) -].reset_index(drop=True) - -# %% -in_list_codable_disagreement_one_code = in_list_codable_disagreement[ - in_list_codable_disagreement["llm_soc_code"].str.len() == 1 -] -in_list_codable_agreement_one_code = in_list_codable_agreement[ - in_list_codable_agreement["llm_soc_code"].str.len() == 1 -] - -# %% -soc_lookup = ( - pd.concat([soc_lookup, in_list_codable_agreement_one_code[["documents", "label"]]]) - .drop_duplicates(subset=["documents", "label"]) - .reset_index(drop=True) -) - -# %% [markdown] -# # LLM candidates - high likelihood (0.9/0.7) - -# %% -data_multiple_candidates = data[data["llm_soc_candidates"].str.len() > 1].reset_index( - drop=True -) - - -# %% -def get_high_candidate(row: pd.Series) -> str: - """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM), - where only one candidate got that score. - - Args: - row: pd.Series: row with LLM output - - Returns: - str: most likely candidate. - """ - high_likelihood = [] - for i in row["llm_soc_candidates"]: - if i["likelihood"] >= 0.9: # noqa: PLR2004 - high_likelihood.append(i) - if len(high_likelihood) != 1: - return None - return high_likelihood[0]["soc_code"] - - -# %% -def get_high_candidate_with_low_other(row: pd.Series) -> str: - """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM), - where only one candidate got that score, and no other candidates got likelihood score above 0.7. - - Args: - row: pd.Series: row with LLM output - - Returns: - str: most likely candidate. - """ - high_likelihood, lower_likelihood = [], [] - - for i in row["llm_soc_candidates"]: - if i["likelihood"] >= 0.9: # noqa: PLR2004 - high_likelihood.append(i) - elif i["likelihood"] >= 0.7: # noqa: PLR2004 - lower_likelihood.append(i) - - if len(high_likelihood) != 1 or len(lower_likelihood) > 0: - return None - return high_likelihood[0]["soc_code"] - - -# %% -data_multiple_candidates.loc[:, "most_likely_candidate"] = ( - data_multiple_candidates.apply(get_high_candidate, axis=1) -) - -# %% -print(len(data_multiple_candidates)) - -# %% -data_high_likelihood = data_multiple_candidates[ - data_multiple_candidates["most_likely_candidate"].notna() -] - -# %% -data_high_likelihood_agreement = data_high_likelihood[ - data_high_likelihood.apply( - lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1 - ) -].reset_index(drop=True) - -# %% -print(data_high_likelihood_agreement.iloc[0]) - -# %% -misspelled = 0 -for k in data_high_likelihood_agreement["reasoning"]: - # print(k) - if "misspelling" in k or "misspelled" in k: - misspelled += 1 - -# %% -print(misspelled) - -# %% -print( - f"""We looked at the rows, where the LLM decided thre is more than one possible SOC code candidate {len(data_multiple_candidates)} codes ({round((len(data_multiple_candidates) / len(data)) * 100, 2)}% of all codes). -To that subset of data, we added a new column 'most_likely_candidate'. -It was populated with codes, that were assessed to have a high (0.9) likelihood. -If the LLM assigned more than one code with high likelihood, those were disregarded, -as there is no way to determine which code is more likely, according to the LLM, meaning it is not unambiguous. - -Only one candidate with 0.9 likelihood was assigned for {len(data_high_likelihood)} rows. - -Next, we compared the agreement between the label assigned in the original data with the "most_likely_candidate", -which resulted in {len(data_high_likelihood_agreement)} cases. -""" -) - -# %% -data_high = data[data["llm_soc_candidates"].str.len() > 1].reset_index(drop=True) - -# %% -data_high.loc[:, "most_likely_candidate"] = data_high.apply( - get_high_candidate_with_low_other, axis=1 -) - -# %% -data_high_notna = data_high[data_high["most_likely_candidate"].notna()] - -# %% -data_high_notna_agreement = data_high_notna[ - data_high_notna.apply( - lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1 - ) -].reset_index(drop=True) - -# %% -soc_lookup = ( - pd.concat([soc_lookup, data_high_notna_agreement[["documents", "label"]]]) - .drop_duplicates(subset=["documents", "label"]) - .reset_index(drop=True) -) - -# %% -# soc_lookup.to_csv(f"{knowledge_bucket}wip_data/SOC_DIRECT_LOOKUP.csv") - -# %% -# data_one_code_no_phantoms -len(soc_lookup) - -# %% -soc_copy = ( - pd.concat([soc_lookup, data_one_code_no_phantoms[["documents", "label"]]]) - .drop_duplicates(subset=["documents"]) - .reset_index(drop=True) -) - -# %% -len(soc_copy) From be20c2ed0644c980737afbeb522d041860fd35a5 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Tue, 16 Jun 2026 08:31:37 +0000 Subject: [PATCH 23/24] add level of education --- src/occupational_classification_utils/llm/prompt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py index a289e0c..3955d04 100644 --- a/src/occupational_classification_utils/llm/prompt.py +++ b/src/occupational_classification_utils/llm/prompt.py @@ -104,6 +104,7 @@ - Company's main activity: {industry_descr} - Job Title: {job_title} - Job Description: {job_description} +- Level of Education: {level_of_education} ===Relevant subset of UK SOC 2020=== {soc_index} From e8c5773b4b292ae38635c87acb16e939a1107969 Mon Sep 17 00:00:00 2001 From: peter-spencer-ons Date: Mon, 22 Jun 2026 18:55:31 +0000 Subject: [PATCH 24/24] include level_of_education in two prompt pipeline --- src/occupational_classification_utils/llm/llm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py index c4390e1..aa7cc83 100644 --- a/src/occupational_classification_utils/llm/llm.py +++ b/src/occupational_classification_utils/llm/llm.py @@ -289,6 +289,11 @@ async def unambiguous_soc_code( # noqa: PLR0913 if (job_description is None or job_description in {"", " "}) else job_description ) + level_of_education = ( + "Unknown" + if (level_of_education is None or level_of_education in {"", " "}) + else level_of_education + ) call_dict = { "industry_descr": industry_descr,