From 0ffe78232f91c18bc9feea8ad3f8567c659f32e1 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Mon, 11 May 2026 12:28:07 +0000
Subject: [PATCH 01/24] Add prompt and validation model for initial SOC
 classification

---
 .../llm/prompt.py                             | 50 ++++++++++++++
 .../models/response_model.py                  | 68 +++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index af71a4f..f7ca4d1 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -30,6 +30,7 @@
 from occupational_classification_utils.models.response_model import (
     RagResponse,
     SocResponse,
+    UnambiguousResponse,
 )
 
 config = get_config()
@@ -150,6 +151,55 @@
     },
 )
 
+
+_soc_template_unambiguous = """You are an expert in occupational classifications.
+You are tasked with determining whether a survey response can be assigned to a
+single 4-digit UK Standard Occupational Classification (SOC) code based on initial respondent data alone.
+
+Key objective:  Determine if the response can be coded unambiguously to a single 4-digit SOC code.
+
+Assignment logic:
+1. Code as unambiguous when response can be coded to a single 4-digit SOC code with 99
+per cent confidence based on available evidence.
+2. Code as uncodable to 4-digit when multiple candidates are plausible and
+additional information is needed to distinguish between them.
+
+===Analysis steps===
+Follow these steps in order:
+1. Review each candidate from the shortlist of relevant SOC codes against the respondent data.
+2. Assess alignment - Consider:
+   - Semantic similarity between respondent descriptions and SOC code descriptions
+   - Job role compatibility with typical activities in each SOC code
+   - Industry context alignment
+   - Matches with specific examples listed under each code.
+3. Assign confidence scores - Rate each candidate from 0.1 (least likely) to 0.9 (most likely).
+4. Decide if response can be coded unambiguously to a single 4-digit SOC code with 99 per cent confidence.
+5. Provide reasoning for your decision.
+
+===Respondent Data===
+- Industry description: {industry_descr}
+- Job Title: {job_title}
+- Job Description: {job_description}
+- Level of Education: {level_of_education}
+
+===Shortlist===
+{soc_candidates}
+
+===Output Format===
+{format_instructions}
+"""
+
+parser_unambiguous = PydanticOutputParser(  # type: ignore # Suspect langchain ver bug
+    pydantic_object=UnambiguousResponse
+)
+
+SOC_PROMPT_UNAMBIGUOUS = PromptTemplate.from_template(
+    template=_core_prompt + _soc_template_unambiguous,
+    partial_variables={
+        "format_instructions": parser_unambiguous.get_format_instructions(),
+    },
+)
+
 FIX_PARSING_PROMPT = PromptTemplate.from_template(
     """You are a meticulous assistant tasked with ensuring that
 the output from a language model adheres strictly to the required JSON format.
diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py
index 87da5e6..994f765 100644
--- a/src/occupational_classification_utils/models/response_model.py
+++ b/src/occupational_classification_utils/models/response_model.py
@@ -271,3 +271,71 @@ class SurveyAssistSocResponse(BaseModel):
         selected. Specifies the information used to assign the SOC code or any
         additional information required to assign a SOC code.""",
     )
+
+
+class UnambiguousResponse(BaseModel):
+    """Represents a response model for classification code assignment.
+
+    Attributes:
+        codable (bool): True only if enough information is provided to assign
+            an unambiguous single classification code, False otherwise.
+        class_code (Optional[str]): Full classification code (to the required number of digits)
+            assigned based on provided respondent's data. Expected when codable=True and
+            None when codable=False; no validator on this model enforces that pairing.
+        class_descriptive (Optional[str]): Descriptive label of the classification category.
+            Expected when codable=True and None when codable=False (likewise not enforced).
+        alt_candidates (list[RagCandidate]): Short list of possible classification codes with their
+            descriptive labels and estimated likelihoods (1 to 10 entries when supplied).
+        reasoning (str): Step by step reasoning behind the classification selected.
+    """
+
+    codable: bool = Field(
+        description="True only if enough information is provided to decide an unambiguous "
+        "classification code, False otherwise."
+    )
+
+    class_code: Optional[str] = Field(
+        default=None,
+        description="Full classification code (to the required number of digits) "
+        "assigned based on provided respondent's data. Must be present if codable=True, "
+        "must be None if codable=False.",
+    )
+
+    class_descriptive: Optional[str] = Field(
+        default=None,
+        description="Descriptive label of the classification category. "
+        "Must be present if codable=True, must be None if codable=False.",
+    )
+
+    alt_candidates: list[RagCandidate] = Field(
+        default_factory=list,
+        description="Short list of possible classification codes with their "
+        "descriptive labels and estimated likelihoods.",
+        min_length=1,  # Ensure there's always at least one candidate
+        max_length=10,  # Allow at most 10 candidates
+    )
+
+    reasoning: str = Field(
+        description="Step by step reasoning behind the classification selected.",
+        min_length=50,  # Ensure detailed reasoning is provided
+    )
+
+    @field_validator("alt_candidates")
+    @classmethod
+    def validate_alt_candidates(cls, v):
+        """Validates the number of alternative candidates.
+
+        Ensures that the number of candidates is between 1 and the maximum allowed.
+
+        Args:
+            v (list): The list of alternative candidates.
+
+        Returns:
+            list: The validated list of candidates.
+
+        Raises:
+            ValueError: If the number of candidates is not within the allowed range.
+        """
+        if not 1 <= len(v) <= MAX_ALT_CANDIDATES:
+            raise ValueError("alt_candidates must contain between 1 and 10 items.")
+        return v
\ No newline at end of file

From 1db4e8197838aeb859a055127708389d783e9307 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Mon, 11 May 2026 15:07:43 +0000
Subject: [PATCH 02/24] add llm method for initial SOC classification

---
 .../llm/llm.py                                | 169 +++++++++++++++++-
 1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index 8a93597..f809d1e 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -15,6 +15,7 @@
     (None at the module level)
 """
 
+import time
 from collections import defaultdict
 from functools import lru_cache
 from typing import Any, Optional, Union
@@ -37,8 +38,9 @@
     FIX_PARSING_PROMPT,
     SA_SOC_PROMPT_RAG,
     SOC_PROMPT_PYDANTIC,
+    SOC_PROMPT_UNAMBIGUOUS,
 )
-from occupational_classification_utils.models.response_model import SocResponse
+from occupational_classification_utils.models.response_model import SocResponse, UnambiguousResponse
 
 logger = get_logger(__name__)
 config = get_config()
@@ -103,6 +105,7 @@ def __init__(  # noqa: PLR0913
         self.soc_meta = get_soc_meta(config["lookups"]["soc_structure"])
         self.soc_prompt = SOC_PROMPT_PYDANTIC
         self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG
+        self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS
         self.soc: Optional[SOC] = None
         self.verbose = verbose
 
@@ -399,3 +402,167 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
                 )
 
         return validated_answer, short_list, call_dict
+
+
+    async def unambiguous_soc_code(  # noqa: PLR0913
+        self,
+        industry_descr: str,
+        semantic_search_results: list[dict],
+        job_title: Optional[str] = None,
+        job_description: Optional[str] = None,
+        level_of_education: Optional[str] = None,
+        candidates_limit: int = config["llm"]["candidates_limit"],
+        code_digits: int = config["llm"]["code_digits"],
+        correlation_id: Optional[str] = None,
+    ) -> tuple[UnambiguousResponse, Optional[Any]]:
+        """Evaluates codability to a single 4-digit SOC code based on respondent's data.
+
+        Args:
+            industry_descr (str): The description of the industry.
+            semantic_search_results (list of dicts): List of semantic search results.
+            job_title (str, optional): The job title. Defaults to None.
+            job_description (str, optional): The job description. Defaults to None.
+            level_of_education (str, optional): The level of education. Defaults to None.
+            candidates_limit (int, optional): The maximum number of candidates
+                to include in the prompt. Defaults to 5.
+            code_digits (int, optional): The number of digits to consider from
+                the code for filtering candidates. Defaults to 5.
+            correlation_id (str, optional): Optional correlation ID for request tracking.
+
+        Returns:
+            tuple[UnambiguousResponse, dict]: The validated response and the prompt inputs used.
+
+        Note:
+            ValueError from the chain call and parse failures (after one
+            reformatting retry) are logged and returned as a non-codable
+            fallback response rather than raised.
+
+        """
+        soc_candidates = self._prompt_candidate_list(
+            short_list=semantic_search_results,
+            code_digits=code_digits,
+            candidates_limit=candidates_limit,
+        )
+
+        job_title = (
+            "Unknown" if (job_title is None or job_title in {"", " "}) else job_title
+        )
+        job_description = (
+            "Unknown"
+            if (job_description is None or job_description in {"", " "})
+            else job_description
+        )
+        level_of_education = (
+            "Unknown" if (level_of_education is None or level_of_education in {"", " "}) else level_of_education
+        )
+
+        call_dict = {
+            "industry_descr": industry_descr,
+            "job_title": job_title,
+            "job_description": job_description,
+            "level_of_education": level_of_education,
+            "soc_candidates": soc_candidates,
+        }
+
+        if self.verbose:
+            final_prompt = self.soc_prompt_unambiguous.format(**call_dict)
+            logger.debug(final_prompt)
+
+        chain = self.soc_prompt_unambiguous | self.llm
+
+        # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier.
+        # logger.info(
+        #     "LLM request sent - unambiguous_soc_code",
+        #     job_title=truncate_identifier(job_title),
+        #     job_description=truncate_identifier(job_description),
+        #     level_of_education=truncate_identifier(level_of_education),
+        #     industry_descr=truncate_identifier(industry_descr),
+        #     correlation_id=correlation_id or "",
+        # )
+        llm_start = time.perf_counter()
+
+        try:
+            response = await chain.ainvoke(call_dict, return_only_outputs=True)
+        except ValueError as err:
+            logger.error(
+                f"Error from chain, exit early: {err}",
+                error=str(err),
+                correlation_id=correlation_id or "",
+            )
+            validated_answer = UnambiguousResponse.model_construct(  # skip validation: fallback breaks min_length rules
+                codable=False,
+                alt_candidates=[],
+                reasoning="Error from chain, exit early",
+            )
+            return validated_answer, call_dict
+
+        if self.verbose:
+            logger.debug(f"llm_response={response}")
+
+        # Parse the output to the desired format
+        parser = PydanticOutputParser(pydantic_object=UnambiguousResponse)  # type: ignore
+        try:
+            validated_answer = parser.parse(str(response.content))
+            # Log LLM response received after successful parse
+            alt_candidates_count = len(
+                getattr(validated_answer, "alt_candidates", []) or []
+            )
+            codable = bool(getattr(validated_answer, "codable", False))
+            selected_code = (
+                str(getattr(validated_answer, "class_code", "")) if codable else ""
+            )
+            llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)
+            logger.info(
+                "LLM response received for unambiguous soc prompt",
+                codable=str(codable),
+                selected_code=selected_code,
+                alt_candidates_count=str(alt_candidates_count),
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
+            )
+        except (ValueError, AttributeError) as parse_error:
+            logger.error(
+                f"Failed to parse response: {parse_error}",
+                error=str(parse_error),
+                correlation_id=correlation_id or "",
+            )
+            llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)
+            logger.warning(
+                "Failed to parse response",
+                response_content=str(response.content),
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
+            )
+
+            # send another llm request to fix the format (1 attempt)
+            try:
+                chain = FIX_PARSING_PROMPT | self.llm
+                response = await chain.ainvoke(
+                    {
+                        "llm_output": str(response.content),
+                        "format_instructions": parser.get_format_instructions(),
+                    },
+                    return_only_outputs=True,
+                )
+                validated_answer = parser.parse(str(response.content))
+                logger.debug("Successfully parsed reformatted response.")
+
+            except (ValueError, AttributeError) as parse_error2:
+                logger.error(
+                    f"Failed to parse response again: {parse_error2}",
+                    error=str(parse_error2),
+                )
+                logger.warning(
+                    "Failed to parse response again",
+                    response_content=str(response.content),
+                )
+                reasoning = (
+                    f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>"
+                )
+                validated_answer = UnambiguousResponse.model_construct(  # skip validation: empty candidates
+                    codable=False,
+                    alt_candidates=[],
+                    reasoning=reasoning,
+                )
+
+        return validated_answer, call_dict
\ No newline at end of file

From b54238d0fc46810b96c7fbd5e93b291957671f70 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 12 May 2026 15:29:59 +0000
Subject: [PATCH 03/24] add llm method, prompt and validation for stage 3

---
 .../llm/llm.py                                | 164 ++++++++++++++++++
 .../llm/prompt.py                             |  68 ++++++++
 .../models/response_model.py                  |  24 ++-
 3 files changed, 255 insertions(+), 1 deletion(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index f809d1e..6110f53 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -39,6 +39,7 @@
     SA_SOC_PROMPT_RAG,
     SOC_PROMPT_PYDANTIC,
     SOC_PROMPT_UNAMBIGUOUS,
+    SOC_PROMPT_OPENFOLLOWUP,
 )
-from occupational_classification_utils.models.response_model import SocResponse, UnambiguousResponse
+from occupational_classification_utils.models.response_model import OpenFollowUp, SocResponse, UnambiguousResponse
 
@@ -106,6 +107,7 @@ def __init__(  # noqa: PLR0913
         self.soc_prompt = SOC_PROMPT_PYDANTIC
         self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG
         self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS
+        self.soc_prompt_openfollowup = SOC_PROMPT_OPENFOLLOWUP
         self.soc: Optional[SOC] = None
         self.verbose = verbose
 
@@ -565,4 +567,166 @@ async def unambiguous_soc_code(  # noqa: PLR0913
                     reasoning=reasoning,
                 )
 
+        return validated_answer, call_dict
+
+    async def formulate_open_question(
+        self,
+        industry_descr: str,
+        job_title: str | None = None,
+        job_description: str | None = None,
+        level_of_education: str | None = None,
+        llm_output: Any | None = None,
+        correlation_id: str | None = None,
+    ) -> tuple[OpenFollowUp, Any]:
+        """Formulates an open-ended question using respondent data and survey design guidelines.
+
+        Args:
+            industry_descr (str): The description of the industry.
+            job_title (str, optional): The job title. Defaults to None.
+            job_description (str, optional): The job description. Defaults to None.
+            level_of_education (str, optional): The level of education. Defaults to None.
+            llm_output (Any, optional): Previous-stage LLM output; stringified into the prompt.
+            correlation_id (str, optional): Optional correlation ID for request tracking.
+
+        Returns:
+            tuple[OpenFollowUp, dict]: The validated response and the prompt inputs used.
+
+        Note:
+            ValueError/AttributeError from the chain call and parse failures
+            (after one reformatting retry) are logged and returned as an
+            OpenFollowUp with followup=None rather than raised.
+
+        """
+
+        def prep_call_dict(industry_descr, job_title, job_description, level_of_education, llm_output):
+            # Helper function to prepare the call dictionary
+            is_job_title_missing = job_title is None or job_title in {"", " "}
+            job_title = "Unknown" if is_job_title_missing else job_title
+
+            is_job_description_missing = job_description is None or job_description in {
+                "",
+                " ",
+            }
+            job_description = (
+                "Unknown" if is_job_description_missing else job_description
+            )
+            level_of_education = (
+                "Unknown" if (level_of_education is None or level_of_education in {"", " "}) else level_of_education
+            )
+
+            call_dict = {
+                "industry_descr": industry_descr,
+                "job_title": job_title,
+                "job_description": job_description,
+                "level_of_education": level_of_education,
+                "llm_output": str(llm_output),
+            }
+            return call_dict
+
+        call_dict = prep_call_dict(
+            industry_descr=industry_descr,
+            job_title=job_title,
+            job_description=job_description,
+            level_of_education=level_of_education,
+            llm_output=llm_output,
+        )
+
+        if self.verbose:
+            final_prompt = self.soc_prompt_openfollowup.format(**call_dict)
+            logger.debug(final_prompt)
+
+        chain = self.soc_prompt_openfollowup | self.llm
+
+        # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier.
+        # logger.info(
+        #     "LLM request sent - formulate_open_question",
+        #     job_title=truncate_identifier(job_title),
+        #     job_description=truncate_identifier(job_description),
+        #     level_of_education=truncate_identifier(level_of_education),
+        #     industry_descr=truncate_identifier(industry_descr),
+        #     correlation_id=correlation_id or "",
+        # )
+        llm_start = time.perf_counter()
+
+        try:
+            response = await chain.ainvoke(call_dict, return_only_outputs=True)
+        except (ValueError, AttributeError) as err:
+            logger.error(
+                f"Error from LLMChain, exit early: {err}",
+                error=str(err),
+                correlation_id=correlation_id or "",
+            )
+            logger.warning(
+                "Error from LLMChain, exit early",
+                correlation_id=correlation_id or "",
+            )
+            validated_answer = OpenFollowUp(
+                followup=None,
+                reasoning="Error from LLMChain, exit early",
+            )
+            return validated_answer, call_dict
+
+        llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)
+
+        # Parse the output to the desired format
+        parser = PydanticOutputParser(pydantic_object=OpenFollowUp)
+        try:
+            validated_answer = parser.parse(str(response.content))
+            # Log LLM response received after successful parse
+            has_followup = bool(getattr(validated_answer, "followup", None))
+            logger.info(
+                "LLM response received for open question prompt",
+                has_followup=str(has_followup),
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
+            )
+        except (ValueError, AttributeError) as parse_error:
+            logger.error(
+                f"Failed to parse response: {parse_error}",
+                error=str(parse_error),
+                correlation_id=correlation_id or "",
+            )
+            logger.warning(
+                "Failed to parse response",
+                response_content=str(response.content),
+                correlation_id=correlation_id or "",
+            )
+            logger.info(
+                "LLM response received for open question prompt",
+                has_followup="False",
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
+            )
+            try:
+                chain = FIX_PARSING_PROMPT | self.llm
+                response = await chain.ainvoke(
+                    {
+                        "llm_output": str(response.content),
+                        "format_instructions": parser.get_format_instructions(),
+                    },
+                    return_only_outputs=True,
+                )
+                validated_answer = parser.parse(str(response.content))
+                logger.debug("Successfully parsed reformatted response.")
+
+            except (ValueError, AttributeError) as parse_error2:
+                logger.error(
+                    f"Failed to parse response again: {parse_error2}",
+                    error=str(parse_error2),
+                )
+                logger.warning(
+                    "Failed to parse response again",
+                    response_content=str(response.content),
+                )
+                reasoning = (
+                    f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>"
+                )
+                validated_answer = OpenFollowUp(
+                    followup=None,
+                    reasoning=reasoning,
+                )
+
+        if self.verbose:
+            logger.debug(f"{response=}")
+
         return validated_answer, call_dict
\ No newline at end of file
diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index f7ca4d1..b813541 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -31,6 +31,7 @@
     RagResponse,
     SocResponse,
     UnambiguousResponse,
+    OpenFollowUp,
 )
 
 config = get_config()
@@ -214,3 +215,70 @@
 {format_instructions}
 """
 )
+
+
+_open_follow_up = """You are an expert survey methodologist specialising in
+    UK occupational classification (UK SOC). Generate one open-ended follow-up question
+    to help assign the most relevant UK SOC code.
+
+Objective
+- Produce exactly one question that elicits the key information needed to distinguish
+    between the shortlisted SOC candidates, focusing on the employer's main business activity.
+
+Inputs
+- Respondent data:
+- Company's main activity: {industry_descr}
+- Job title: {job_title}
+- Job description: {job_description}
+- Level of Education: {level_of_education}
+- Shortlist from previous model: {llm_output}
+- Note: These are candidate SOC categories; do not mention codes or "SOC" to the respondent.
+
+How to decide what to ask
+- Identify the smallest, most informative difference among the candidates and target that with a single question.
+- Prioritise discriminators in this order:
+1) Stage in the value chain (e.g., manufacture/processing vs wholesale vs retail vs repair/installation vs
+    rental/leasing vs publishing/software vs consultancy/training).
+2) Main product or service category (what goods/services the employer mainly provides).
+3) Main customer type (households vs businesses vs government/health/education).
+4) Delivery mode or setting (on-site vs online; physical goods vs digital; own-brand vs third-party).
+- Ask about only one discriminator—the one most likely to resolve the ambiguity.
+
+Quality standards
+- Language and clarity:
+    - Use plain British English; avoid or define jargon and abbreviations.
+    - Keep the single question concise (max 25 words), grammatically correct, and neutral.
+    - Use "employer" for for-profit; use "organisation" for non-profits, charities, public bodies, and education.
+        Default to "employer", if ambiguous.
+    - Refer to the present situation (e.g., "currently", "main").
+    - Do not mention SOC or any code numbers.
+    - Do not ask for company names, client names, or other personal/sensitive data.
+- Question structure:
+    - Start with "What", "How", "Which", or "Where".
+    - Focus on the employer's main business activities, products, or services—not the respondent's personal tasks.
+    - One issue per question; no A/B or either/or phrasing; avoid binary questions.
+    - Limit to one sentence ending with a question mark.
+    - You may add one additional sentence with broad, non-leading examples covering a wide range of options;
+        omit examples if they would be leading.
+- Respondent considerations:
+    - Make it easy to answer in a few words.
+    - Ask only what a typical employee would reasonably know.
+    - Avoid requiring calculations or percentages.
+
+Edge cases
+- If the shortlist is empty or clearly points to one category, ask a general clarifying question about
+    the main product/service or value-chain stage to confirm classification.
+- Do not output explanations or reasoning; only the formatted result.
+
+Output format
+- Return output that strictly follows:
+{format_instructions}
+"""
+parser_followup_open = PydanticOutputParser(pydantic_object=OpenFollowUp)
+
+SOC_PROMPT_OPENFOLLOWUP = PromptTemplate.from_template(
+    template=_core_prompt + _open_follow_up,
+    partial_variables={
+        "format_instructions": parser_followup_open.get_format_instructions(),
+    },
+)
diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py
index 994f765..fbb48a7 100644
--- a/src/occupational_classification_utils/models/response_model.py
+++ b/src/occupational_classification_utils/models/response_model.py
@@ -338,4 +338,26 @@ def validate_alt_candidates(cls, v):
         """
         if not 1 <= len(v) <= MAX_ALT_CANDIDATES:
             raise ValueError("alt_candidates must contain between 1 and 10 items.")
-        return v
\ No newline at end of file
+        return v
+
+
+class OpenFollowUp(BaseModel):
+    """Represents a response model for an open-ended follow-up question.
+
+    Attributes:
+        followup (str | None): Question to ask user in order to collect additional
+            information to enable reliable classification assignment. Defaults to "".
+        reasoning (str): Reasoning explaining how follow-up question will help
+            assign classification code. Defaults to "".
+    """
+
+    followup: str | None = Field(
+        description="""Question to ask user in order to collect additional information
+        to enable reliable classification assignment.""",
+        default="",
+    )
+    reasoning: str = Field(
+        description="""Reasoning explaining how follow-up question will help
+            assign classification code.""",
+        default="",
+    )
\ No newline at end of file

From 0a5b90736dd955b9dd3c95662cf92b29818bcd99 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Fri, 29 May 2026 13:15:59 +0000
Subject: [PATCH 04/24] reflect types as in ruff

---
 .../llm/llm.py                                | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index 2025fa3..3fdbb02 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -18,7 +18,7 @@
 import time
 from collections import defaultdict
 from functools import lru_cache
-from typing import Any, Optional, Union
+from typing import Any
 
 import numpy as np
 from langchain.output_parsers import PydanticOutputParser
@@ -75,11 +75,11 @@ class ClassificationLLM:
     def __init__(  # noqa: PLR0913
         self,
         model_name: str = DEFAULT_LLM_MODEL,
-        llm: Optional[Union[ChatVertexAI, ChatOpenAI]] = None,
+        llm: ChatVertexAI | ChatOpenAI | None = None,
         max_tokens: int = 1600,
         temperature: float = 0.0,
         verbose: bool = True,
-        openai_api_key: Optional[SecretStr] = None,
+        openai_api_key: SecretStr | None = None,
     ):
         """Initialises the ClassificationLLM object."""
         logger.info(
@@ -113,7 +113,7 @@ def __init__(  # noqa: PLR0913
         self.sa_soc_prompt_rag = SA_SOC_PROMPT_RAG
         self.soc_prompt_unambiguous = SOC_PROMPT_UNAMBIGUOUS
         self.soc_prompt_openfollowup = SOC_PROMPT_OPENFOLLOWUP
-        self.soc: Optional[SOC] = None
+        self.soc: SOC | None = None
         self.verbose = verbose
 
     @lru_cache  # noqa: B019
@@ -207,7 +207,7 @@ def _prompt_candidate(
 
     def _prompt_candidate_list(
         self,
-        short_list: Union[list[dict], list[tuple[Document, float]]],  # list[dict],
+        short_list: list[dict] | list[tuple[Document, float]],  # list[dict],
         chars_limit: int = 14000,
         candidates_limit: int = 5,
         titles_limit: int = 3,
@@ -269,13 +269,13 @@ def _prompt_candidate_list(
     async def sa_rag_soc_code(  # noqa: PLR0913
         self,
         industry_descr: str,
-        job_title: Optional[str] = None,
-        job_description: Optional[str] = None,
+        job_title: str | None = None,
+        job_description: str | None = None,
         expand_search_terms: bool = True,
         code_digits: int = 4,
         candidates_limit: int = 5,
-        short_list: Optional[list[dict[Any, Any]]] = None,
-    ) -> tuple[SocResponse, Optional[list[dict[Any, Any]]], Optional[Any]]:
+        short_list: list[dict[Any, Any]] | None = None,
+    ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]:
         """Generates a SOC classification based on respondent's data using RAG approach.
 
         Caller must provide short_list (e.g. from vector store API). Mirrors
@@ -414,13 +414,13 @@ async def unambiguous_soc_code(  # noqa: PLR0913
         self,
         industry_descr: str,
         semantic_search_results: list[dict],
-        job_title: Optional[str] = None,
-        job_description: Optional[str] = None,
-        level_of_education: Optional[str] = None,
+        job_title: str | None = None,
+        job_description: str | None = None,
+        level_of_education: str | None = None,
         candidates_limit: int = 5,
         code_digits: int = 4,
-        correlation_id: Optional[str] = None,
-    ) -> tuple[UnambiguousResponse, Optional[Any]]:
+        correlation_id: str | None = None,
+    ) -> tuple[UnambiguousResponse, Any | None]:
         """Evaluates codability to a single 4-digit SOC code based on respondent's data.
 
         Args:

From 7d76e93b6929836b78999505f0508418fa79ef88 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Fri, 29 May 2026 15:05:22 +0000
Subject: [PATCH 05/24] add tests to meet 80% coverage

---
 tests/test_llm.py | 63 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/tests/test_llm.py b/tests/test_llm.py
index ae2c533..d3e0205 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -16,11 +16,18 @@
 from occupational_classification.data_access.soc_data_access import (
     load_soc_structure as lib_load_soc_structure,
 )
-from occupational_classification.hierarchy.soc_hierarchy import load_hierarchy
+from occupational_classification.hierarchy.soc_hierarchy import (
+    SOC,
+    SocNode,
+    load_hierarchy,
+)
 
 from occupational_classification_utils.llm.llm import ClassificationLLM
 from occupational_classification_utils.llm.prompt import SA_SOC_PROMPT_RAG
-from occupational_classification_utils.models.response_model import SocResponse
+from occupational_classification_utils.models.response_model import (
+    OpenFollowUp,
+    SocResponse,
+)
 
 MODEL_NAME = "gemini-2.5-flash"
 LOCATION = "europe-west2"
@@ -78,6 +85,32 @@ def mock_vertex_ai():
         yield
 
 
+@pytest.fixture
+def prompt_candidate_soc():
+    nodes = [
+        SocNode(
+            soc_code="1234",
+            group_title="grouptitle1234",
+            group_description="description12345",
+        ),
+        SocNode(
+            soc_code="2345",
+            group_title="grouptitle2345",
+            group_description="description2345",
+        ),
+    ]
+    lookup = {}
+    for node in nodes:
+        lookup[str(node.soc_code)] = node
+
+    print("LOOKUP", lookup)
+
+    soc = SOC(nodes=nodes, lookup=lookup)
+    llm_class = ClassificationLLM(model_name=MODEL_NAME)
+    llm_class.soc = soc
+    return llm_class
+
+
 @pytest.mark.parametrize(
     "model, openai_api_key, expected_model",
     [
@@ -324,3 +357,29 @@ async def test_sa_rag_soc_code_short_list_is_none_raise_value_error(
             job_description="teach children",
             short_list=None,
         )
+
+
+@pytest.mark.llm
+async def test_llm_response_mocked_formulate_open_question(
+    mocker, prompt_candidate_soc
+):
+    mock_object_dict = {"class_code": "", "class_descriptive": "", "likelihood": 0.5}
+    mock_object_json = json.dumps(mock_object_dict)
+
+    mock_message = mocker.Mock(spec=AIMessage)
+    mock_message.content = mock_object_json
+
+    mock_patcher = mocker.patch(  # noqa: F841
+        "occupational_classification_utils.llm.llm.ChatVertexAI.ainvoke",
+        return_value=mock_message,
+    )
+
+    result = await prompt_candidate_soc.formulate_open_question(
+        industry_descr="",
+        job_title="",
+        job_description="",
+        level_of_education="",
+        llm_output="",
+    )
+    assert isinstance(result[0], OpenFollowUp)
+    assert isinstance(result[1], dict)

From 40f0926256b54c4cd079a5e9d7ffc0c18682d3e5 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 2 Jun 2026 13:37:46 +0000
Subject: [PATCH 06/24] reorder llm.py to reflect changes made in main

---
 .../llm/llm.py                                | 264 +++++++++---------
 1 file changed, 132 insertions(+), 132 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index d944d7e..1b9d702 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -391,42 +391,38 @@ async def unambiguous_soc_code(  # noqa: PLR0913
 
         return validated_answer, call_dict
 
-    async def sa_rag_soc_code(  # noqa: PLR0913
+    async def formulate_open_question(  # noqa: PLR0913
         self,
         industry_descr: str,
         job_title: str | None = None,
         job_description: str | None = None,
-        code_digits: int = config["llm"]["code_digits"],
-        candidates_limit: int = config["llm"]["candidates_limit"],
-        short_list: list[dict[Any, Any]] | None = None,
-    ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]:
-        """Generates a SOC classification based on respondent's data using RAG approach.
-
-        Caller must provide short_list (e.g. from vector store API). Mirrors
-        sic-classification-utils ``sa_rag_sic_code`` (raises when short_list is None).
+        level_of_education: str | None = None,
+        llm_output: SocCandidate | None = None,
+        correlation_id: str | None = None,
+    ) -> tuple[OpenFollowUp, Any]:
+        """Formulates an open-ended question using respondent data and survey design guidelines.
 
         Args:
             industry_descr (str): The description of the industry.
             job_title (str, optional): The job title. Defaults to None.
             job_description (str, optional): The job description. Defaults to None.
-            code_digits (int, optional): The number of digits in the generated
-                SOC code. Defaults to 4.
-            candidates_limit (int, optional): The maximum number of SOC code candidates
-                to consider. Defaults to 5.
-            short_list (list[dict[Any, Any]], optional): A list of results from
-                embedding or vector store search (e.g. from soc-classification-vector-store).
-                Each dict should have "code" and "title" keys.
+            level_of_education (str, optional): The level od education. Defaults to None.
+            llm_output (SocCandidate, optional): The response from the LLM model.
+            correlation_id (str, optional): Optional correlation ID for request tracking.
 
         Returns:
-            SocResponse: The generated response to the query.
+            OpenFollowUp: The generated response to the query.
 
         Raises:
             ValueError: If there is an error during the parsing of the response.
-            ValueError: If short_list is None.
+            ValueError: If the default embedding handler is required but
+                not loaded correctly.
 
         """
 
-        def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
+        def prep_call_dict(
+            industry_descr, job_title, job_description, level_of_education, llm_output
+        ):
             # Helper function to prepare the call dictionary
             is_job_title_present = job_title is None or job_title in {"", " "}
             job_title = "Unknown" if is_job_title_present else job_title
@@ -438,63 +434,95 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
             job_description = (
                 "Unknown" if is_job_description_present else job_description
             )
+            level_of_education = (
+                "Unknown"
+                if (level_of_education is None or level_of_education in {"", " "})
+                else level_of_education
+            )
 
             call_dict = {
                 "industry_descr": industry_descr,
                 "job_title": job_title,
                 "job_description": job_description,
-                "soc_index": soc_codes,
+                "level_of_education": level_of_education,
+                "llm_output": str(llm_output),
             }
             return call_dict
 
-        if short_list is None:
-            raise ValueError(
-                "Short list is None - list provided from embedding search."
-            )
-
-        soc_codes = self._prompt_candidate_list(
-            short_list, code_digits=code_digits, candidates_limit=candidates_limit
-        )
-
         call_dict = prep_call_dict(
             industry_descr=industry_descr,
             job_title=job_title,
             job_description=job_description,
-            soc_codes=soc_codes,
+            level_of_education=level_of_education,
+            llm_output=llm_output,
         )
 
         if self.verbose:
-            final_prompt = self.sa_soc_prompt_rag.format(**call_dict)
-            logger.debug(f"Final prompt: {final_prompt}")
+            final_prompt = self.soc_prompt_openfollowup.format(**call_dict)
+            logger.debug(final_prompt)
 
-        chain = self.sa_soc_prompt_rag | self.llm
+        chain = self.soc_prompt_openfollowup | self.llm
+
+        # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier.
+        # logger.info(
+        #     "LLM request sent - formulate_open_question",
+        #     job_title=truncate_identifier(job_title),
+        #     job_description=truncate_identifier(job_description),
+        #     level_of_education=truncate_identifier(level_of_education),
+        #     industry_descr=truncate_identifier(industry_descr),
+        #     correlation_id=correlation_id or "",
+        # )
+        llm_start = time.perf_counter()
 
         try:
             response = await chain.ainvoke(call_dict, return_only_outputs=True)
-        except ValueError as err:
-            logger.error(f"Error from chain, exit early: {err}", error=str(err))
-            validated_answer = SocResponse(
-                followup="Follow-up question not available due to error.",
-                reasoning="Error from chain, exit early",
+        except (ValueError, AttributeError) as err:
+            logger.error(
+                f"Error from LLMChain, exit early: {err}",
+                error=str(err),
+                correlation_id=correlation_id or "",
             )
-            return validated_answer, short_list, call_dict
+            logger.warning(
+                "Error from LLMChain, exit early",
+                correlation_id=correlation_id or "",
+            )
+            validated_answer = OpenFollowUp(
+                followup=None,
+                reasoning="Error from LLMChain, exit early",
+            )
+            return validated_answer, call_dict
 
-        if self.verbose:
-            logger.debug(f"LLM response: {response}")
+        llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)
 
-        parser = PydanticOutputParser(  # type: ignore # Suspect langchain ver bug
-            pydantic_object=SocResponse,
-        )
+        # Parse the output to the desired format
+        parser = PydanticOutputParser(pydantic_object=OpenFollowUp)
         try:
             validated_answer = parser.parse(str(response.content))
+            # Log LLM response received after successful parse
+            has_followup = bool(getattr(validated_answer, "followup", None))
+            logger.info(
+                "LLM response received for open question prompt",
+                has_followup=str(has_followup),
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
+            )
         except (ValueError, AttributeError) as parse_error:
             logger.error(
-                f"Failed to parse response: {parse_error}", error=str(parse_error)
+                f"Failed to parse response: {parse_error}",
+                error=str(parse_error),
+                correlation_id=correlation_id or "",
             )
             logger.warning(
-                "Failed to parse response", response_content=str(response.content)
+                "Failed to parse response",
+                response_content=str(response.content),
+                correlation_id=correlation_id or "",
+            )
+            logger.info(
+                "LLM response received for open question prompt",
+                has_followup="False",
+                duration_ms=str(llm_duration_ms),
+                correlation_id=correlation_id or "",
             )
-
             try:
                 chain = FIX_PARSING_PROMPT | self.llm
                 response = await chain.ainvoke(
@@ -506,6 +534,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
                 )
                 validated_answer = parser.parse(str(response.content))
                 logger.debug("Successfully parsed reformatted response.")
+
             except (ValueError, AttributeError) as parse_error2:
                 logger.error(
                     f"Failed to parse response again: {parse_error2}",
@@ -518,45 +547,52 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
                 reasoning = (
                     f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>"
                 )
-                validated_answer = SocResponse(
-                    followup="Follow-up question not available due to error.",
+                validated_answer = OpenFollowUp(
+                    followup=None,
                     reasoning=reasoning,
                 )
 
-        return validated_answer, short_list, call_dict
+        if self.verbose:
+            logger.debug(f"{response=}")
 
-    async def formulate_open_question(  # noqa: PLR0913
+        return validated_answer, call_dict
+
+    async def sa_rag_soc_code(  # noqa: PLR0913
         self,
         industry_descr: str,
         job_title: str | None = None,
         job_description: str | None = None,
-        level_of_education: str | None = None,
-        llm_output: SocCandidate | None = None,
-        correlation_id: str | None = None,
-    ) -> tuple[OpenFollowUp, Any]:
-        """Formulates an open-ended question using respondent data and survey design guidelines.
+        code_digits: int = config["llm"]["code_digits"],
+        candidates_limit: int = config["llm"]["candidates_limit"],
+        short_list: list[dict[Any, Any]] | None = None,
+    ) -> tuple[SocResponse, list[dict[Any, Any]] | None, Any | None]:
+        """Generates a SOC classification based on respondent's data using RAG approach.
+
+        Caller must provide short_list (e.g. from vector store API). Mirrors
+        sic-classification-utils ``sa_rag_sic_code`` (raises when short_list is None).
 
         Args:
             industry_descr (str): The description of the industry.
             job_title (str, optional): The job title. Defaults to None.
             job_description (str, optional): The job description. Defaults to None.
-            level_of_education (str, optional): The level od education. Defaults to None.
-            llm_output (SocCandidate, optional): The response from the LLM model.
-            correlation_id (str, optional): Optional correlation ID for request tracking.
+            code_digits (int, optional): The number of digits in the generated
+                SOC code. Defaults to 4.
+            candidates_limit (int, optional): The maximum number of SOC code candidates
+                to consider. Defaults to 5.
+            short_list (list[dict[Any, Any]], optional): A list of results from
+                embedding or vector store search (e.g. from soc-classification-vector-store).
+                Each dict should have "code" and "title" keys.
 
         Returns:
-            OpenFollowUp: The generated response to the query.
+            SocResponse: The generated response to the query.
 
         Raises:
             ValueError: If there is an error during the parsing of the response.
-            ValueError: If the default embedding handler is required but
-                not loaded correctly.
+            ValueError: If short_list is None.
 
         """
 
-        def prep_call_dict(
-            industry_descr, job_title, job_description, level_of_education, llm_output
-        ):
+        def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
             # Helper function to prepare the call dictionary
             is_job_title_present = job_title is None or job_title in {"", " "}
             job_title = "Unknown" if is_job_title_present else job_title
@@ -568,95 +604,63 @@ def prep_call_dict(
             job_description = (
                 "Unknown" if is_job_description_present else job_description
             )
-            level_of_education = (
-                "Unknown"
-                if (level_of_education is None or level_of_education in {"", " "})
-                else level_of_education
-            )
 
             call_dict = {
                 "industry_descr": industry_descr,
                 "job_title": job_title,
                 "job_description": job_description,
-                "level_of_education": level_of_education,
-                "llm_output": str(llm_output),
+                "soc_index": soc_codes,
             }
             return call_dict
 
+        if short_list is None:
+            raise ValueError(
+                "Short list is None - list provided from embedding search."
+            )
+
+        soc_codes = self._prompt_candidate_list(
+            short_list, code_digits=code_digits, candidates_limit=candidates_limit
+        )
+
         call_dict = prep_call_dict(
             industry_descr=industry_descr,
             job_title=job_title,
             job_description=job_description,
-            level_of_education=level_of_education,
-            llm_output=llm_output,
+            soc_codes=soc_codes,
         )
 
         if self.verbose:
-            final_prompt = self.soc_prompt_openfollowup.format(**call_dict)
-            logger.debug(final_prompt)
-
-        chain = self.soc_prompt_openfollowup | self.llm
+            final_prompt = self.sa_soc_prompt_rag.format(**call_dict)
+            logger.debug(f"Final prompt: {final_prompt}")
 
-        # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier.
-        # logger.info(
-        #     "LLM request sent - formulate_open_question",
-        #     job_title=truncate_identifier(job_title),
-        #     job_description=truncate_identifier(job_description),
-        #     level_of_education=truncate_identifier(level_of_education),
-        #     industry_descr=truncate_identifier(industry_descr),
-        #     correlation_id=correlation_id or "",
-        # )
-        llm_start = time.perf_counter()
+        chain = self.sa_soc_prompt_rag | self.llm
 
         try:
             response = await chain.ainvoke(call_dict, return_only_outputs=True)
-        except (ValueError, AttributeError) as err:
-            logger.error(
-                f"Error from LLMChain, exit early: {err}",
-                error=str(err),
-                correlation_id=correlation_id or "",
-            )
-            logger.warning(
-                "Error from LLMChain, exit early",
-                correlation_id=correlation_id or "",
-            )
-            validated_answer = OpenFollowUp(
-                followup=None,
-                reasoning="Error from LLMChain, exit early",
+        except ValueError as err:
+            logger.error(f"Error from chain, exit early: {err}", error=str(err))
+            validated_answer = SocResponse(
+                followup="Follow-up question not available due to error.",
+                reasoning="Error from chain, exit early",
             )
-            return validated_answer, call_dict
+            return validated_answer, short_list, call_dict
 
-        llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)
+        if self.verbose:
+            logger.debug(f"LLM response: {response}")
 
-        # Parse the output to the desired format
-        parser = PydanticOutputParser(pydantic_object=OpenFollowUp)
+        parser = PydanticOutputParser(  # type: ignore # Suspect langchain ver bug
+            pydantic_object=SocResponse,
+        )
         try:
             validated_answer = parser.parse(str(response.content))
-            # Log LLM response received after successful parse
-            has_followup = bool(getattr(validated_answer, "followup", None))
-            logger.info(
-                "LLM response received for open question prompt",
-                has_followup=str(has_followup),
-                duration_ms=str(llm_duration_ms),
-                correlation_id=correlation_id or "",
-            )
         except (ValueError, AttributeError) as parse_error:
             logger.error(
-                f"Failed to parse response: {parse_error}",
-                error=str(parse_error),
-                correlation_id=correlation_id or "",
+                f"Failed to parse response: {parse_error}", error=str(parse_error)
             )
             logger.warning(
-                "Failed to parse response",
-                response_content=str(response.content),
-                correlation_id=correlation_id or "",
-            )
-            logger.info(
-                "LLM response received for open question prompt",
-                has_followup="False",
-                duration_ms=str(llm_duration_ms),
-                correlation_id=correlation_id or "",
+                "Failed to parse response", response_content=str(response.content)
             )
+
             try:
                 chain = FIX_PARSING_PROMPT | self.llm
                 response = await chain.ainvoke(
@@ -668,7 +672,6 @@ def prep_call_dict(
                 )
                 validated_answer = parser.parse(str(response.content))
                 logger.debug("Successfully parsed reformatted response.")
-
             except (ValueError, AttributeError) as parse_error2:
                 logger.error(
                     f"Failed to parse response again: {parse_error2}",
@@ -681,12 +684,9 @@ def prep_call_dict(
                 reasoning = (
                     f"ERROR parse_error=<{parse_error2}>, response=<{response.content}>"
                 )
-                validated_answer = OpenFollowUp(
-                    followup=None,
+                validated_answer = SocResponse(
+                    followup="Follow-up question not available due to error.",
                     reasoning=reasoning,
                 )
 
-        if self.verbose:
-            logger.debug(f"{response=}")
-
-        return validated_answer, call_dict
+        return validated_answer, short_list, call_dict

From b419c6c9f854313b6d940ccbb61d29a4bdfd018c Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 2 Jun 2026 14:19:31 +0000
Subject: [PATCH 07/24] Use more general validation method

---
 src/occupational_classification_utils/llm/llm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index 1b9d702..5361008 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -42,7 +42,7 @@
 )
 from occupational_classification_utils.models.response_model import (
     OpenFollowUp,
-    SocCandidate,
+    RagCandidate,
     SocResponse,
     UnambiguousResponse,
 )
@@ -397,7 +397,7 @@ async def formulate_open_question(  # noqa: PLR0913
         job_title: str | None = None,
         job_description: str | None = None,
         level_of_education: str | None = None,
-        llm_output: SocCandidate | None = None,
+        llm_output: RagCandidate | None = None,
         correlation_id: str | None = None,
     ) -> tuple[OpenFollowUp, Any]:
         """Formulates an open-ended question using respondent data and survey design guidelines.
@@ -407,7 +407,7 @@ async def formulate_open_question(  # noqa: PLR0913
             job_title (str, optional): The job title. Defaults to None.
             job_description (str, optional): The job description. Defaults to None.
             level_of_education (str, optional): The level od education. Defaults to None.
-            llm_output (SocCandidate, optional): The response from the LLM model.
+            llm_output (RagCandidate, optional): The response from the LLM model.
             correlation_id (str, optional): Optional correlation ID for request tracking.
 
         Returns:

From a9b7c45d09bdc997623550f33b406fe8bc01ba10 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 2 Jun 2026 14:24:26 +0000
Subject: [PATCH 08/24] remove duplications in prompt.py

---
 .../llm/prompt.py                             | 67 -------------------
 1 file changed, 67 deletions(-)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index 43ed998..78be492 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -280,70 +280,3 @@
         "format_instructions": parser_followup_open.get_format_instructions(),
     },
 )
-
-
-_open_follow_up = """"You are an expert survey methodologist specialising in
-    UK industrial classification (UK SOC). Generate one open-ended follow-up question
-    to help assign the most relevant UK SOC code.
-
-Objective
-- Produce exactly one question that elicits the key information needed to distinguish
-    between the shortlisted SOC candidates, focusing on the employer's main business activity.
-
-Inputs
-- Respondent data:
-- Company's main activity: {industry_descr}
-- Job title: {job_title}
-- Job description: {job_description}
-- Level of Education: {level_of_education}
-- Shortlist from previous model: {llm_output}
-- Note: These are candidate SOC categories; do not mention codes or "SOC" to the respondent.
-
-How to decide what to ask
-- Identify the smallest, most informative difference among the candidates and target that with a single question.
-- Prioritise discriminators in this order:
-1) Stage in the value chain (e.g., manufacture/processing vs wholesale vs retail vs repair/installation vs
-    rental/leasing vs publishing/software vs consultancy/training).
-2) Main product or service category (what goods/services the employer mainly provides).
-3) Main customer type (households vs businesses vs government/health/education).
-4) Delivery mode or setting (on-site vs online; physical goods vs digital; own-brand vs third-party).
-- Ask about only one discriminator—the one most likely to resolve the ambiguity.
-
-Quality standards
-- Language and clarity:
-    - Use plain British English; avoid or define jargon and abbreviations.
-    - Keep the single question concise (max 25 words), grammatically correct, and neutral.
-    - Use "employer" for for-profit; use "organisation" for non-profits, charities, public bodies, and education.
-        Default to "employer", if ambiguous.
-    - Refer to the present situation (e.g., "currently", "main").
-    - Do not mention SOC or any code numbers.
-    - Do not ask for company names, client names, or other personal/sensitive data.
-- Question structure:
-    - Start with "What", "How", "Which", or "Where".
-    - Focus on the employer's main business activities, products, or services—not the respondent's personal tasks.
-    - One issue per question; no A/B or either/or phrasing; avoid binary questions.
-    - Limit to one sentence ending with a question mark.
-    - You may add one additional sentence with broad, non-leading examples covering a wide range of options;
-        omit examples if they would be leading.
-- Respondent considerations:
-    - Make it easy to answer in a few words.
-    - Ask only what a typical employee would reasonably know.
-    - Avoid requiring calculations or percentages.
-
-Edge cases
-- If the shortlist is empty or clearly points to one category, ask a general clarifying question about
-    the main product/service or value-chain stage to confirm classification.
-- Do not output explanations or reasoning; only the formatted result.
-
-Output format
-- Return output that strictly follows:
-{format_instructions}
-"""
-parser_followup_open = PydanticOutputParser(pydantic_object=OpenFollowUp)
-
-SOC_PROMPT_OPENFOLLOWUP = PromptTemplate.from_template(
-    template=_core_prompt + _open_follow_up,
-    partial_variables={
-        "format_instructions": parser_followup_open.get_format_instructions(),
-    },
-)

From 7f7730695b8c23151c6d8035309c1a1836bc098d Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 2 Jun 2026 14:28:21 +0000
Subject: [PATCH 09/24] uncomment code in llm.py

---
 .../llm/llm.py                                 | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index 5361008..de7ad71 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -463,15 +463,15 @@ def prep_call_dict(
 
         chain = self.soc_prompt_openfollowup | self.llm
 
-        # Log LLM request sent # Not logging yet - needs to create/import truncate_identifier.
-        # logger.info(
-        #     "LLM request sent - formulate_open_question",
-        #     job_title=truncate_identifier(job_title),
-        #     job_description=truncate_identifier(job_description),
-        #     level_of_education=truncate_identifier(level_of_education),
-        #     industry_descr=truncate_identifier(industry_descr),
-        #     correlation_id=correlation_id or "",
-        # )
+        # Log LLM request sent
+        logger.info(
+            "LLM request sent - formulate_open_question",
+            job_title=truncate_identifier(job_title),
+            job_description=truncate_identifier(job_description),
+            level_of_education=truncate_identifier(level_of_education),
+            industry_descr=truncate_identifier(industry_descr),
+            correlation_id=correlation_id or "",
+        )
         llm_start = time.perf_counter()
 
         try:

From 0fbf380b6020a2a205a6ba643efdb19125a06e08 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 2 Jun 2026 15:16:15 +0000
Subject: [PATCH 10/24] correct validation method

---
 .../models/response_model.py                                  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py
index a7624d8..768b56d 100644
--- a/src/occupational_classification_utils/models/response_model.py
+++ b/src/occupational_classification_utils/models/response_model.py
@@ -17,7 +17,7 @@
     MAX_ALT_CANDIDATES: Maximum number of alternative candidates allowed in certain models.
 """
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 from occupational_classification_utils.utils.constants import MAX_ALT_CANDIDATES
 
@@ -323,6 +323,8 @@ class UnambiguousResponse(BaseModel):
         min_length=50,  # Ensure detailed reasoning is provided
     )
 
+    @field_validator("alt_candidates")
+    @classmethod
     def validate_alt_candidates(cls, v):
         """Validates the number of alternative candidates.
 

From 9117aa1da55597aa26c4e2aa7e92e27f7da918cf Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 14:24:58 +0000
Subject: [PATCH 11/24] comment out level_of_education

---
 .../llm/llm.py                                | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index de7ad71..bbc8deb 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -122,7 +122,7 @@ async def get_soc_code(
         self,
         job_title: str,
         job_description: str,
-        level_of_education: str,
+        # level_of_education: str,
         manage_others: bool,
         industry_descr: str,
     ) -> SocResponse:
@@ -148,7 +148,7 @@ async def get_soc_code(
             {
                 "job_title": job_title,
                 "job_description": job_description,
-                "level_of_education": level_of_education,
+                # "level_of_education": level_of_education,
                 "manage_others": manage_others,
                 "industry_descr": industry_descr,
             },
@@ -391,12 +391,12 @@ async def unambiguous_soc_code(  # noqa: PLR0913
 
         return validated_answer, call_dict
 
-    async def formulate_open_question(  # noqa: PLR0913
+    async def formulate_open_question(
         self,
         industry_descr: str,
         job_title: str | None = None,
         job_description: str | None = None,
-        level_of_education: str | None = None,
+        # level_of_education: str | None = None,
         llm_output: RagCandidate | None = None,
         correlation_id: str | None = None,
     ) -> tuple[OpenFollowUp, Any]:
@@ -421,7 +421,11 @@ async def formulate_open_question(  # noqa: PLR0913
         """
 
         def prep_call_dict(
-            industry_descr, job_title, job_description, level_of_education, llm_output
+            industry_descr,
+            job_title,
+            job_description,
+            # level_of_education,
+            llm_output,
         ):
             # Helper function to prepare the call dictionary
             is_job_title_present = job_title is None or job_title in {"", " "}
@@ -434,17 +438,17 @@ def prep_call_dict(
             job_description = (
                 "Unknown" if is_job_description_present else job_description
             )
-            level_of_education = (
-                "Unknown"
-                if (level_of_education is None or level_of_education in {"", " "})
-                else level_of_education
-            )
+            # level_of_education = (
+            #     "Unknown"
+            #     if (level_of_education is None or level_of_education in {"", " "})
+            #     else level_of_education
+            # )
 
             call_dict = {
                 "industry_descr": industry_descr,
                 "job_title": job_title,
                 "job_description": job_description,
-                "level_of_education": level_of_education,
+                # "level_of_education": level_of_education,
                 "llm_output": str(llm_output),
             }
             return call_dict
@@ -453,7 +457,7 @@ def prep_call_dict(
             industry_descr=industry_descr,
             job_title=job_title,
             job_description=job_description,
-            level_of_education=level_of_education,
+            # level_of_education=level_of_education,
             llm_output=llm_output,
         )
 
@@ -468,7 +472,7 @@ def prep_call_dict(
             "LLM request sent - formulate_open_question",
             job_title=truncate_identifier(job_title),
             job_description=truncate_identifier(job_description),
-            level_of_education=truncate_identifier(level_of_education),
+            # level_of_education=truncate_identifier(level_of_education),
             industry_descr=truncate_identifier(industry_descr),
             correlation_id=correlation_id or "",
         )

From 0788eb936cbc61300f0206ac528c97c4b6c1ceef Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 14:44:38 +0000
Subject: [PATCH 12/24] remove level of education from test and prompt

---
 src/occupational_classification_utils/llm/prompt.py | 1 -
 tests/test_llm.py                                   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index 78be492..ee92fb4 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -50,7 +50,6 @@
 ===Respondent Data===
 - Job Title: {job_title}
 - Job Description: {job_description}
-- Level of Education: {level_of_education}
 - Line Management Responsibilities: {manage_others}
 - Company's main activity: {industry_descr}
 
diff --git a/tests/test_llm.py b/tests/test_llm.py
index ba91dd5..5bf56ef 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -246,7 +246,7 @@ async def test_llm_response_mocked_get_soc_code():
         result = await ClassificationLLM(model_name=MODEL_NAME).get_soc_code(
             job_title="teacher",
             job_description="teach children",
-            level_of_education="degree",
+            # level_of_education="degree",
             manage_others=False,
             industry_descr="school",
         )
@@ -457,7 +457,7 @@ async def test_llm_response_mocked_formulate_open_question(
         industry_descr="",
         job_title="",
         job_description="",
-        level_of_education="",
+        # level_of_education="",
         llm_output="",
     )
     assert isinstance(result[0], OpenFollowUp)

From 0179a004e17fbdc18dba11ac3a7bdd8a7cc8754d Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 15:29:43 +0000
Subject: [PATCH 13/24] Revert "remove level of education from test and prompt"

This reverts commit 0788eb936cbc61300f0206ac528c97c4b6c1ceef.
---
 src/occupational_classification_utils/llm/prompt.py | 1 +
 tests/test_llm.py                                   | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index ee92fb4..78be492 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -50,6 +50,7 @@
 ===Respondent Data===
 - Job Title: {job_title}
 - Job Description: {job_description}
+- Level of Education: {level_of_education}
 - Line Management Responsibilities: {manage_others}
 - Company's main activity: {industry_descr}
 
diff --git a/tests/test_llm.py b/tests/test_llm.py
index 5bf56ef..ba91dd5 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -246,7 +246,7 @@ async def test_llm_response_mocked_get_soc_code():
         result = await ClassificationLLM(model_name=MODEL_NAME).get_soc_code(
             job_title="teacher",
             job_description="teach children",
-            # level_of_education="degree",
+            level_of_education="degree",
             manage_others=False,
             industry_descr="school",
         )
@@ -457,7 +457,7 @@ async def test_llm_response_mocked_formulate_open_question(
         industry_descr="",
         job_title="",
         job_description="",
-        # level_of_education="",
+        level_of_education="",
         llm_output="",
     )
     assert isinstance(result[0], OpenFollowUp)

From 00988762c67759ad7d5aa83028e2bd834e624d18 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 15:30:29 +0000
Subject: [PATCH 14/24] Revert "hash level_of_education"

This reverts commit 9117aa1da55597aa26c4e2aa7e92e27f7da918cf.
---
 .../llm/llm.py                                | 30 ++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index bbc8deb..de7ad71 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -122,7 +122,7 @@ async def get_soc_code(
         self,
         job_title: str,
         job_description: str,
-        # level_of_education: str,
+        level_of_education: str,
         manage_others: bool,
         industry_descr: str,
     ) -> SocResponse:
@@ -148,7 +148,7 @@ async def get_soc_code(
             {
                 "job_title": job_title,
                 "job_description": job_description,
-                # "level_of_education": level_of_education,
+                "level_of_education": level_of_education,
                 "manage_others": manage_others,
                 "industry_descr": industry_descr,
             },
@@ -391,12 +391,12 @@ async def unambiguous_soc_code(  # noqa: PLR0913
 
         return validated_answer, call_dict
 
-    async def formulate_open_question(
+    async def formulate_open_question(  # noqa: PLR0913
         self,
         industry_descr: str,
         job_title: str | None = None,
         job_description: str | None = None,
-        # level_of_education: str | None = None,
+        level_of_education: str | None = None,
         llm_output: RagCandidate | None = None,
         correlation_id: str | None = None,
     ) -> tuple[OpenFollowUp, Any]:
@@ -421,11 +421,7 @@ async def formulate_open_question(
         """
 
         def prep_call_dict(
-            industry_descr,
-            job_title,
-            job_description,
-            # level_of_education,
-            llm_output,
+            industry_descr, job_title, job_description, level_of_education, llm_output
         ):
             # Helper function to prepare the call dictionary
             is_job_title_present = job_title is None or job_title in {"", " "}
@@ -438,17 +434,17 @@ def prep_call_dict(
             job_description = (
                 "Unknown" if is_job_description_present else job_description
             )
-            # level_of_education = (
-            #     "Unknown"
-            #     if (level_of_education is None or level_of_education in {"", " "})
-            #     else level_of_education
-            # )
+            level_of_education = (
+                "Unknown"
+                if (level_of_education is None or level_of_education in {"", " "})
+                else level_of_education
+            )
 
             call_dict = {
                 "industry_descr": industry_descr,
                 "job_title": job_title,
                 "job_description": job_description,
-                # "level_of_education": level_of_education,
+                "level_of_education": level_of_education,
                 "llm_output": str(llm_output),
             }
             return call_dict
@@ -457,7 +453,7 @@ def prep_call_dict(
             industry_descr=industry_descr,
             job_title=job_title,
             job_description=job_description,
-            # level_of_education=level_of_education,
+            level_of_education=level_of_education,
             llm_output=llm_output,
         )
 
@@ -472,7 +468,7 @@ def prep_call_dict(
             "LLM request sent - formulate_open_question",
             job_title=truncate_identifier(job_title),
             job_description=truncate_identifier(job_description),
-            # level_of_education=truncate_identifier(level_of_education),
+            level_of_education=truncate_identifier(level_of_education),
             industry_descr=truncate_identifier(industry_descr),
             correlation_id=correlation_id or "",
         )

From cb4be4ef2d149ca3a773543a4b59021a970d7455 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 16:00:01 +0000
Subject: [PATCH 15/24] level of education typehint: optional str

---
 src/occupational_classification_utils/llm/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index de7ad71..d2bd0cd 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -122,7 +122,7 @@ async def get_soc_code(
         self,
         job_title: str,
         job_description: str,
-        level_of_education: str,
+        level_of_education: str | None,
         manage_others: bool,
         industry_descr: str,
     ) -> SocResponse:

From 5d24c2ecf9b59f5106a3bec0089b4585bdde0e7c Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Wed, 3 Jun 2026 16:14:38 +0000
Subject: [PATCH 16/24] add level of education field in the followup question

---
 src/occupational_classification_utils/llm/prompt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index 78be492..01aa70a 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -229,6 +229,7 @@
 - Company's main activity: {industry_descr}
 - Job title: {job_title}
 - Job description: {job_description}
+- Level of Education: {level_of_education}
 - Shortlist from previous model: {llm_output}
 - Note: These are candidate occupational categories; do not mention codes or "SOC"
 to the respondent.

From 91ab1f13a74dc1d2eda2c4ad3bdb95e2e493e568 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Thu, 4 Jun 2026 16:54:03 +0000
Subject: [PATCH 17/24] level of education for stages 2 and 3 in the pipeline

---
 src/occupational_classification_utils/llm/llm.py    | 13 +++++++++++--
 src/occupational_classification_utils/llm/prompt.py |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index d2bd0cd..c4390e1 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -269,6 +269,7 @@ async def unambiguous_soc_code(  # noqa: PLR0913
         semantic_search_results: list[dict],
         job_title: str | None = None,
         job_description: str | None = None,
+        level_of_education: str | None = None,
         candidates_limit: int = config["llm"]["candidates_limit"],
         code_digits: int = config["llm"]["code_digits"],
         correlation_id: str | None = None,
@@ -293,6 +294,7 @@ async def unambiguous_soc_code(  # noqa: PLR0913
             "industry_descr": industry_descr,
             "job_title": job_title,
             "job_description": job_description,
+            "level_of_education": level_of_education,
             "soc_candidates": soc_candidates,
         }
 
@@ -305,6 +307,7 @@ async def unambiguous_soc_code(  # noqa: PLR0913
             "LLM request sent - unambiguous_soc_code",
             job_title=truncate_identifier(job_title),
             job_description=truncate_identifier(job_description),
+            level_of_education=truncate_identifier(str(level_of_education)),
             industry_descr=truncate_identifier(industry_descr),
             correlation_id=correlation_id or "",
         )
@@ -468,7 +471,7 @@ def prep_call_dict(
             "LLM request sent - formulate_open_question",
             job_title=truncate_identifier(job_title),
             job_description=truncate_identifier(job_description),
-            level_of_education=truncate_identifier(level_of_education),
+            level_of_education=truncate_identifier(str(level_of_education)),
             industry_descr=truncate_identifier(industry_descr),
             correlation_id=correlation_id or "",
         )
@@ -562,6 +565,7 @@ async def sa_rag_soc_code(  # noqa: PLR0913
         industry_descr: str,
         job_title: str | None = None,
         job_description: str | None = None,
+        level_of_education: str | None = None,
         code_digits: int = config["llm"]["code_digits"],
         candidates_limit: int = config["llm"]["candidates_limit"],
         short_list: list[dict[Any, Any]] | None = None,
@@ -575,6 +579,7 @@ async def sa_rag_soc_code(  # noqa: PLR0913
             industry_descr (str): The description of the industry.
             job_title (str, optional): The job title. Defaults to None.
             job_description (str, optional): The job description. Defaults to None.
+            level_of_education (str): The level of education required for the job.
             code_digits (int, optional): The number of digits in the generated
                 SOC code. Defaults to 4.
             candidates_limit (int, optional): The maximum number of SOC code candidates
@@ -592,7 +597,9 @@ async def sa_rag_soc_code(  # noqa: PLR0913
 
         """
 
-        def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
+        def prep_call_dict(
+            industry_descr, job_title, job_description, level_of_education, soc_codes
+        ):
             # Helper function to prepare the call dictionary
             is_job_title_present = job_title is None or job_title in {"", " "}
             job_title = "Unknown" if is_job_title_present else job_title
@@ -609,6 +616,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
                 "industry_descr": industry_descr,
                 "job_title": job_title,
                 "job_description": job_description,
+                "level_of_education": level_of_education,
                 "soc_index": soc_codes,
             }
             return call_dict
@@ -626,6 +634,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
             industry_descr=industry_descr,
             job_title=job_title,
             job_description=job_description,
+            level_of_education=level_of_education,
             soc_codes=soc_codes,
         )
 
diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index 01aa70a..a289e0c 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -197,6 +197,7 @@
 - Company's main activity: {industry_descr}
 - Job Title: {job_title}
 - Job Description: {job_description}
+- Level of Education: {level_of_education}
 
 ===Shortlist===
 {soc_candidates}

From 4fb77660d5bbe80a11beb5f4fa75329b29d50e04 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Thu, 4 Jun 2026 17:06:31 +0000
Subject: [PATCH 18/24] allow zero alternative candidates

---
 .../models/response_model.py                               | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/occupational_classification_utils/models/response_model.py b/src/occupational_classification_utils/models/response_model.py
index 768b56d..177813e 100644
--- a/src/occupational_classification_utils/models/response_model.py
+++ b/src/occupational_classification_utils/models/response_model.py
@@ -314,7 +314,6 @@ class UnambiguousResponse(BaseModel):
         default_factory=list,
         description="Short list of possible classification codes with their "
         "descriptive labels and estimated likelihoods.",
-        min_length=1,  # Ensure there's always at least one candidate
         max_length=10,  # Limit to less than 10 candidates
     )
 
@@ -328,7 +327,7 @@ class UnambiguousResponse(BaseModel):
     def validate_alt_candidates(cls, v):
         """Validates the number of alternative candidates.
 
-        Ensures that the number of candidates is between 1 and the maximum allowed.
+        Ensures that the number of candidates does not exceed the maximum allowed.
 
         Args:
             v (list): The list of alternative candidates.
@@ -339,8 +338,8 @@ def validate_alt_candidates(cls, v):
         Raises:
             ValueError: If the number of candidates is not within the allowed range.
         """
-        if not 1 <= len(v) <= MAX_ALT_CANDIDATES:
-            raise ValueError("alt_candidates must contain between 1 and 10 items.")
+        if not len(v) <= MAX_ALT_CANDIDATES:
+            raise ValueError("alt_candidates must contain no more than 10 items.")
         return v
 
 

From 88337c01e478613c672ca5f4d56f475ab198c0ce Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Mon, 15 Jun 2026 09:59:17 +0000
Subject: [PATCH 19/24] add level of education to the prompt

---
 src/occupational_classification_utils/llm/prompt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index a289e0c..3955d04 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -104,6 +104,7 @@
 - Company's main activity: {industry_descr}
 - Job Title: {job_title}
 - Job Description: {job_description}
+- Level of Education: {level_of_education}
 
 ===Relevant subset of UK SOC 2020===
 {soc_index}

From 07c23f14d0c7dc40e402c2b272fb52b20ec7d7d5 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Mon, 15 Jun 2026 15:10:24 +0000
Subject: [PATCH 20/24] create soc lookup

---
 notebooks/create_soc_lookup_2026_04.py | 673 +++++++++++++++++++++++++
 1 file changed, 673 insertions(+)
 create mode 100644 notebooks/create_soc_lookup_2026_04.py

diff --git a/notebooks/create_soc_lookup_2026_04.py b/notebooks/create_soc_lookup_2026_04.py
new file mode 100644
index 0000000..2b246f3
--- /dev/null
+++ b/notebooks/create_soc_lookup_2026_04.py
@@ -0,0 +1,673 @@
+# %%
+# pylint: disable=C0103, C0114, C0301, R0801, W0105
+
+"""Notebook attempting to create a SOC DIRECT LOOKUP.
+
+Disabling duplicate code - methods need to be changed in other repos to reflect the change in data.
+Disabling line-too-long: commentary and discussion.
+Disabling pointless-string-statement: comments to the code for reading clarity.
+"""
+
+# %%
+import ast
+import re
+
+import dotenv
+import pandas as pd
+
+# %%
+from occupational_classification.data_access.soc_data_access import (
+    _combine_soc_index_job_title as combine_job_title,
+)
+
+# %%
+input_folder = "soc_data"
+
+file_name = "ashe_llm_soc_codes"
+
+file_suffix = "_2026_05_19"
+
+# %%
+# read the data
+data = pd.read_csv(f"notebooks/{input_folder}/{file_name}{file_suffix}.csv")
+
+# %%
+# use only columns needed
+data = data[
+    [
+        "documents",
+        # "corrected_spelling",
+        "label",
+        "codable",
+        "llm_soc_code",
+        "llm_soc_candidates",
+        "reasoning",
+    ]
+]
+
+
+# %%
+def parse_string(text):
+    """Convert string to a list of dictionaries for SOC candidates."""
+    if isinstance(text, str):
+        processed = text.replace("SocCandidate(", "dict(")
+        processed = re.sub(r"(\w+)=", r'"\1":', processed)
+        processed = processed.replace("dict(", "{").replace(")", "}")
+        return ast.literal_eval(processed)
+    return []
+
+
+# %%
+# string to list of dictionaries
+data["llm_soc_candidates"] = data["llm_soc_candidates"].map(parse_string)
+
+# %%
+print(f"llm {data['codable'].value_counts()}")
+
+
+# %%
+def access_soc_code_from_candidate_list(row_values: list[dict]) -> list[str]:
+    """From list of potential SOC candidates, access SOC codes.
+
+    Args:
+        row_values (list[dict]): list of dictionaries with SOC candidates.
+
+    Return:
+        candidates (list[str]): list of 4-digit candidate codes.
+    """
+    if isinstance(row_values, list):
+        candidates = []
+        for row in row_values:
+            if len(row) < 1:
+                return None
+            candidates.append(row.get("soc_code"))
+    else:
+        return None
+    return candidates
+
+
+# %%
+def float_to_list_of_codes(row_values: float) -> str:
+    """Convert float to a string of codes (str).
+
+    Args:
+        row_values (float): SOC code as a float.
+
+    Return:
+        list: one-element list holding the SOC code (a float is formatted as a string).
+    """
+    if isinstance(row_values, float):
+        codes_list = [f"{row_values:.0f}"]
+        print(type(codes_list))
+        return codes_list
+    return [row_values]
+
+
+# %%
+data["label"] = data["label"].astype(str)
+
+# %%
+msk = data["llm_soc_code"].isna()  # take rows, where LLM didn't provide a code.
+
+# %%
+data.loc[~msk, "llm_soc_code"] = data.loc[~msk, "llm_soc_code"].apply(
+    float_to_list_of_codes
+)
+
+# %%
+data.loc[msk, "llm_soc_code"] = data.loc[msk, "llm_soc_candidates"].apply(
+    access_soc_code_from_candidate_list
+)
+
+
+# %%
+def check_agreement(df: pd.DataFrame, df_source: str):
+    """Checks agreement between ASHE and LLM assigned codes.
+
+    Args:
+        df (pd.DataFrame): dataframe containing columns 'label' and 'llm_soc_code' with codes.
+        df_source (str): String indicating the source of the dataframe (ASHE or soc index).
+    """
+    agr, in_cand = 0, 0
+    # check if 'label' is the same as 'llm_soc_code'.
+    # If LLM uncodable, check if 'label' in candidates.
+    for row in range(len(df)):
+        if len(df.iloc[row]["llm_soc_code"]) == 1:
+            agr += df.iloc[row]["label"] == df.iloc[row]["llm_soc_code"][0]
+            df.loc[row, "codable"] = True
+        elif len(df.iloc[row]["llm_soc_code"]) > 1:
+            in_cand += df.iloc[row]["label"] in df.iloc[row]["llm_soc_code"]
+
+    print(
+        f"Agreement full {df_source}: {agr} ({round(agr / len(df), 2) * 100}% of all rows)"
+    )
+    print(
+        f"Agreement (code in candidates) {df_source}: {in_cand} ({round(in_cand / len(df), 2) * 100}% of all rows)"  # pylint: disable=C0301
+    )
+    print(
+        f"Agreement (label the same or within candidates) {df_source}: {agr + in_cand} ({round((agr + in_cand) / len(df), 2) * 100}% of all rows)"  # pylint: disable=C0301
+    )
+
+
+# %%
+check_agreement(data, "ASHE and LLM")
+
+# %%
+len(data)
+
+
+# %%
+def check_code_count(df: pd.DataFrame, df_source: str):
+    """Check if the LLM assigned a sigle, multiple, or none codes when assessing SOC codes.
+
+    Args:
+        df (pd.DataFrame): dataframe containing LLM assessment of SOC codes.
+            Requires 'llm_soc_code' column.
+        df_source (str): String indicating the source of the dataframe (ASHE or soc index).
+    """
+    longer, shorter, one_code = 0, 0, 0
+    for code in df["llm_soc_code"]:
+        if isinstance(code, list):
+            if len(code) > 1:
+                longer += 1
+            if len(code) < 1:
+                shorter += 1
+            if len(code) == 1:
+                one_code += 1
+        else:
+            one_code += 1
+
+    print(
+        f"More than one code {df_source}: {longer} ({round(longer / len(df) * 100, 2)}%)"
+    )
+    print(
+        f"No codes assigned {df_source}: {shorter} ({round(shorter / len(df) * 100, 2)}%)"
+    )
+    print(
+        f"One code assigned {df_source}: {one_code} ({round(one_code / len(df) * 100, 2)}%)"
+    )
+
+
+# %%
+check_code_count(data, "ASHE")
+
+# %%
+full_data_codable = data[data["codable"]]
+
+# %%
+data_only_columns = data[["documents", "llm_soc_code"]]
+
+# %%
+data_one_code = data_only_columns[data_only_columns["llm_soc_code"].str.len() == 1]
+
+# %%
+e = data_one_code["llm_soc_code"].str[0]
+
+# %%
+numeric = pd.to_numeric(e, errors="coerce")
+
+# %%
+data_one_code["llm_soc_code"] = numeric
+
+# %%
+data_one_code = data_one_code.dropna(subset=["llm_soc_code"])
+
+# %%
+data_one_code["llm_soc_code"] = data_one_code["llm_soc_code"].astype(int)
+
+# %%
+data_one_code = data_one_code.rename(
+    columns={"corrected_spelling": "documents", "llm_soc_code": "label"}
+)
+
+# %%
+data_one_code = data_one_code.drop_duplicates(
+    subset=["documents", "label"], keep="last", ignore_index=True
+)
+
+
+# %%
+def load_soc_framework(filepath: str) -> pd.DataFrame:
+    """Load SOC structure.
+
+    Provides structure with all levels and names of the SOC 2020.
+
+    Args:
+        filepath (str): A path to the file containing SOC Structure.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing group code, group title,
+        group description, typical entry routes and associated qualifications,
+        and list of tasks.
+    """
+    soc_df = pd.read_excel(
+        filepath,
+        sheet_name="SOC2020 framework",
+        usecols=[
+            "SOC2020 Unit Group",
+            "SOC2020 Group Title",
+        ],
+        dtype=str,
+    )
+    soc_df.columns = [
+        col.lower().replace(" ", "_").replace("__", "_").replace("\n", "")
+        for col in soc_df.columns
+    ]
+    soc_df = soc_df.rename(
+        columns={"soc2020_unit_group": "code", "soc2020_group_title": "title"}
+    )
+
+    for col in soc_df.columns:
+        soc_df[col] = soc_df[col].str.strip()
+
+    return soc_df
+
+
+# %%
+knowledge_bucket = dotenv.get_key(".env", "KNOWLEDGE_BUCKET")
+
+# %%
+s_list = load_soc_framework(
+    f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx"
+)
+s_list = s_list[s_list["code"].notna()]
+
+# %%
+codes_from_framework_str = list(s_list["code"].value_counts().keys())
+
+# %%
+codes_from_framework_int = []
+for k in codes_from_framework_str:
+    codes_from_framework_int.append(int(k))
+
+# %%
+phantom_codes = (
+    data_one_code[~data_one_code["label"].isin(codes_from_framework_int)]["label"]
+    .value_counts()
+    .keys()
+)
+
+# %%
+print("codes that don't appear in the SOC codes list\n", phantom_codes)
+
+# %%
+data_one_code_no_phantoms = data_one_code[
+    data_one_code["label"].isin(codes_from_framework_int)
+]
+
+# %%
+coded_all = len(data_one_code)
+
+# %%
+coded_no_phantom = len(data_one_code_no_phantoms)
+
+# %%
+diff = len(data_one_code) - len(data_one_code_no_phantoms)
+
+# %%
+drop = diff / coded_all * 100
+
+# %%
+print(
+    f"with phantoms: {coded_all}\ncoded no phantoms: {coded_no_phantom}\ndiff: {diff}\ndrop(%): {drop:.2f}"  # pylint: disable=C0301
+)
+
+# %%
+print(data_one_code_no_phantoms)
+
+# %%
+print(
+    "check if there is any duplicates\n",
+    data_one_code_no_phantoms[
+        data_one_code_no_phantoms.duplicated(subset=["documents"])
+    ],
+)
+
+# %%
+data_one_code_no_phantoms = data_one_code_no_phantoms.drop_duplicates(
+    subset=["documents"], keep="last", ignore_index=True
+)
+
+# %%
+""" data_one_code_no_phantoms contains codes assigned by the LLM. Some of the codes were not present in the SOC codes list, and have been removed.
+Those codes do not necessarily agree with the codes initially assigned in the ASHE dataset.
+"""
+
+# %%
+# data_one_code_no_phantoms.to_csv("soc_data/SOC_DIRECT_LOOKUP.csv")
+
+# %%
+# data_one_code_no_phantoms.to_csv(f"{knowledge_bucket}SOC_DIRECT_LOOKUP.csv")
+
+# %% [markdown]
+# # AGREEMENT
+
+# %%
+"""Select a subset of codes, where LLM and ASHE assign the same code for a given job title.
+"""
+
+# %%
+msk_codable = data["codable"]
+
+# %%
+data_codable = data[msk_codable]
+
+# %%
+len(data_codable[data_codable["llm_soc_code"].str.len() > 1])
+
+# %%
+len(data_codable[(data_codable["llm_soc_code"].str.len() == 1)])
+
+# %%
+len(
+    data_codable[data_codable["llm_soc_code"].str.len() < 1]
+)  # expect 0 - if is codable, there should be a code available
+
+# %%
+print(data_codable[data_codable["llm_soc_code"].str.len() == 1])
+
+# %%
+one_code_subset = data_codable[data_codable["llm_soc_code"].str.len() == 1]
+
+# %%
+codes_with_agreement = one_code_subset[
+    one_code_subset.apply(lambda r: str(r["label"]) in str(r["llm_soc_code"]), axis=1)
+].reset_index(drop=True)
+
+# %%
+soc_lookup = codes_with_agreement[["documents", "label"]]
+
+# %%
+# save this once all is finished
+
+# %% [markdown]
+# # One code from LLM - why disagreement?
+
+# %%
+full_data_one_code = data[data["llm_soc_code"].str.len() == 1]
+
+# %%
+one_code_disagreement = full_data_one_code[
+    full_data_one_code.apply(
+        lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1
+    )
+].reset_index(drop=True)
+
+# %%
+print(one_code_disagreement)
+
+# %%
+"""Look at the cases, where:
+- LLM claims is codable ('codable' == True)
+- ASHE does not agree with LLM ('label' != 'llm_soc_code')
+- ASHE is one of the candidates selected by LLM ('label' in 'llm_soc_candidates')
+"""
+
+
+# %%
+def get_candidates_list(row: pd.Series) -> list:
+    """Get a list of candidates determined by LLM.
+
+    Args:
+        row: pd.Series: row with LLM output
+
+    Returns:
+        list: lsit of candidates.
+    """
+    candidates = []
+    for i in row["llm_soc_candidates"]:
+        candidates.append(i["soc_code"])
+    return candidates
+
+
+# %%
+ashe_llm_disagreement_multi_candidate = one_code_disagreement[
+    one_code_disagreement["llm_soc_candidates"].str.len() > 1
+].reset_index(drop=True)
+
+# %%
+ashe_llm_disagreement_multi_candidate.loc[:, "candidate_list"] = (
+    ashe_llm_disagreement_multi_candidate.apply(get_candidates_list, axis=1)
+)
+
+# %%
+ashe_in_canidates = ashe_llm_disagreement_multi_candidate[
+    ashe_llm_disagreement_multi_candidate.apply(
+        lambda r: str(r["label"]) in str(r["candidate_list"]), axis=1
+    )
+]
+
+# %%
+print(
+    f"""There is {len(ashe_in_canidates)} rows, where code determined by ASHE appears in the cadnidates from LLM, when LLM assessed the job title is codable."""
+)
+
+# %% [markdown]
+# # How many of the rows that are in SOC INDEX have agreement/don't have agreement with ASHE
+
+# %%
+# Access SOC_INDEX data
+soc_coding_index_file = (
+    f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx"
+)
+
+
+# %%
+def load_soc_index(filepath: str) -> pd.DataFrame:
+    """Load SOC index.
+    Provides a list of over 32,000 titles associated with employment.
+
+    Args:
+        filepath (str): A path to the file containing SOC Index.
+
+    Returns:
+        pd.DataFrame: A DataFrame with transformed job titles.
+    """
+    soc_index_df = pd.read_excel(
+        filepath,
+        sheet_name="SOC2020 coding index",
+        usecols=["SOC_2020", "INDEXOCC-natural_word_order", "ADD", "IND"],
+        dtype=str,
+    )
+
+    soc_index_df.columns = [col.lower() for col in soc_index_df.columns]
+
+    soc_index_df = soc_index_df.rename(
+        columns={"indexocc-natural_word_order": "indexocc", "soc_2020": "code"}
+    )
+
+    soc_index_df = soc_index_df[soc_index_df["code"] != "}}}}"]
+    soc_index_df = soc_index_df.dropna(subset=["code", "indexocc"])
+    soc_index_df["title"] = soc_index_df.apply(combine_job_title, axis=1)
+    soc_index_df = soc_index_df[["code", "title"]]
+    soc_index_df["title"] = soc_index_df["title"].str.capitalize()
+
+    return soc_index_df
+
+
+# %%
+soc_list = load_soc_index(soc_coding_index_file)
+
+# %%
+soc_list["title"] = soc_list["title"].str.upper()
+
+# %%
+titles_list = soc_list["title"]
+titles_list = titles_list.to_list()
+
+# %%
+# get subset of the ASHE data that comes from soc_index
+in_list = data[data["documents"].isin(titles_list)].reset_index(drop=True)
+
+# %%
+in_list_codable = in_list[in_list["codable"]]
+
+# %%
+in_list_codable_disagreement = in_list_codable[
+    in_list_codable.apply(
+        lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1
+    )
+].reset_index(drop=True)
+in_list_codable_agreement = in_list_codable[
+    in_list_codable.apply(
+        lambda r: str(r["label"]) in str(r["llm_soc_candidates"]), axis=1
+    )
+].reset_index(drop=True)
+
+# %%
+in_list_codable_disagreement_one_code = in_list_codable_disagreement[
+    in_list_codable_disagreement["llm_soc_code"].str.len() == 1
+]
+in_list_codable_agreement_one_code = in_list_codable_agreement[
+    in_list_codable_agreement["llm_soc_code"].str.len() == 1
+]
+
+# %%
+soc_lookup = (
+    pd.concat([soc_lookup, in_list_codable_agreement_one_code[["documents", "label"]]])
+    .drop_duplicates(subset=["documents", "label"])
+    .reset_index(drop=True)
+)
+
+# %% [markdown]
+# # LLM candidates - high likelihood (0.9/0.7)
+
+# %%
+data_multiple_candidates = data[data["llm_soc_candidates"].str.len() > 1].reset_index(
+    drop=True
+)
+
+
+# %%
+def get_high_candidate(row: pd.Series) -> str:
+    """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM),
+    where only one candidate got that score.
+
+    Args:
+        row: pd.Series: row with LLM output
+
+    Returns:
+        str: most likely candidate.
+    """
+    high_likelihood = []
+    for i in row["llm_soc_candidates"]:
+        if i["likelihood"] >= 0.9:  # noqa: PLR2004
+            high_likelihood.append(i)
+    if len(high_likelihood) != 1:
+        return None
+    return high_likelihood[0]["soc_code"]
+
+
+# %%
+def get_high_candidate_with_low_other(row: pd.Series) -> str:
+    """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM),
+    where only one candidate got that score, and no other candidates got likelihood score above 0.7.
+
+    Args:
+        row: pd.Series: row with LLM output
+
+    Returns:
+        str: most likely candidate.
+    """
+    high_likelihood, lower_likelihood = [], []
+
+    for i in row["llm_soc_candidates"]:
+        if i["likelihood"] >= 0.9:  # noqa: PLR2004
+            high_likelihood.append(i)
+        elif i["likelihood"] >= 0.7:  # noqa: PLR2004
+            lower_likelihood.append(i)
+
+    if len(high_likelihood) != 1 or len(lower_likelihood) > 0:
+        return None
+    return high_likelihood[0]["soc_code"]
+
+
+# %%
+data_multiple_candidates.loc[:, "most_likely_candidate"] = (
+    data_multiple_candidates.apply(get_high_candidate, axis=1)
+)
+
+# %%
+print(len(data_multiple_candidates))
+
+# %%
+data_high_likelihood = data_multiple_candidates[
+    data_multiple_candidates["most_likely_candidate"].notna()
+]
+
+# %%
+data_high_likelihood_agreement = data_high_likelihood[
+    data_high_likelihood.apply(
+        lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1
+    )
+].reset_index(drop=True)
+
+# %%
+print(data_high_likelihood_agreement.iloc[0])
+
+# %%
+misspelled = 0
+for k in data_high_likelihood_agreement["reasoning"]:
+    # print(k)
+    if "misspelling" in k or "misspelled" in k:
+        misspelled += 1
+
+# %%
+print(misspelled)
+
+# %%
+print(
+    f"""We looked at the rows, where the LLM decided thre is more than one possible SOC code candidate {len(data_multiple_candidates)} codes ({round((len(data_multiple_candidates) / len(data)) * 100, 2)}% of all codes).
+To that subset of data, we added a new column 'most_likely_candidate'.
+It was populated with codes, that were assessed to have a high (0.9) likelihood.
+If the LLM assigned more than one code with high likelihood, those were disregarded,
+as there is no way to determine which code is more likely, according to the LLM, meaning it is not unambiguous.
+
+Only one candidate with 0.9 likelihood was assigned for {len(data_high_likelihood)} rows.
+
+Next, we compared the agreement between the label assigned in the original data with the "most_likely_candidate",
+which resulted in {len(data_high_likelihood_agreement)} cases.
+"""
+)
+
+# %%
+data_high = data[data["llm_soc_candidates"].str.len() > 1].reset_index(drop=True)
+
+# %%
+data_high.loc[:, "most_likely_candidate"] = data_high.apply(
+    get_high_candidate_with_low_other, axis=1
+)
+
+# %%
+data_high_notna = data_high[data_high["most_likely_candidate"].notna()]
+
+# %%
+data_high_notna_agreement = data_high_notna[
+    data_high_notna.apply(
+        lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1
+    )
+].reset_index(drop=True)
+
+# %%
+soc_lookup = (
+    pd.concat([soc_lookup, data_high_notna_agreement[["documents", "label"]]])
+    .drop_duplicates(subset=["documents", "label"])
+    .reset_index(drop=True)
+)
+
+# %%
+# soc_lookup.to_csv(f"{knowledge_bucket}wip_data/SOC_DIRECT_LOOKUP.csv")
+
+# %%
+# data_one_code_no_phantoms
+len(soc_lookup)
+
+# %%
+soc_copy = (
+    pd.concat([soc_lookup, data_one_code_no_phantoms[["documents", "label"]]])
+    .drop_duplicates(subset=["documents"])
+    .reset_index(drop=True)
+)
+
+# %%
+len(soc_copy)

From cf5cb1fbd508dd73162fd8494a7cbc5cdbec0c98 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 16 Jun 2026 08:26:28 +0000
Subject: [PATCH 21/24] Revert "add level of education to the prompt"

This reverts commit 88337c01e478613c672ca5f4d56f475ab198c0ce.
---
 src/occupational_classification_utils/llm/prompt.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index 3955d04..a289e0c 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -104,7 +104,6 @@
 - Company's main activity: {industry_descr}
 - Job Title: {job_title}
 - Job Description: {job_description}
-- Level of Education: {level_of_education}
 
 ===Relevant subset of UK SOC 2020===
 {soc_index}

From 29c57121ff8bcac5ee7244746d0189cb246c7706 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 16 Jun 2026 08:29:23 +0000
Subject: [PATCH 22/24] Revert "create soc lookup"

This reverts commit 07c23f14d0c7dc40e402c2b272fb52b20ec7d7d5.
---
 notebooks/create_soc_lookup_2026_04.py | 673 -------------------------
 1 file changed, 673 deletions(-)
 delete mode 100644 notebooks/create_soc_lookup_2026_04.py

diff --git a/notebooks/create_soc_lookup_2026_04.py b/notebooks/create_soc_lookup_2026_04.py
deleted file mode 100644
index 2b246f3..0000000
--- a/notebooks/create_soc_lookup_2026_04.py
+++ /dev/null
@@ -1,673 +0,0 @@
-# %%
-# pylint: disable=C0103, C0114, C0301, R0801, W0105
-
-"""Noetbook attempting to create a SOC DIRECT LOOKUP.
-
-Diasbling duplicate code - methods needs to be changed in other repos to reflect the change in data.
-Diasbling line-too-long: commentary and discussion.
-Disabling pointless-string-statement: comments to the code for reading clarity.
-"""
-
-# %%
-import ast
-import re
-
-import dotenv
-import pandas as pd
-
-# %%
-from occupational_classification.data_access.soc_data_access import (
-    _combine_soc_index_job_title as combine_job_title,
-)
-
-# %%
-input_folder = "soc_data"
-
-file_name = "ashe_llm_soc_codes"
-
-file_suffix = "_2026_05_19"
-
-# %%
-# read the data
-data = pd.read_csv(f"notebooks/{input_folder}/{file_name}{file_suffix}.csv")
-
-# %%
-# use only columns needed
-data = data[
-    [
-        "documents",
-        # "corrected_spelling",
-        "label",
-        "codable",
-        "llm_soc_code",
-        "llm_soc_candidates",
-        "reasoning",
-    ]
-]
-
-
-# %%
-def parse_string(text):
-    """Convert string to a list of dictionaries for SOC candidates."""
-    if isinstance(text, str):
-        processed = text.replace("SocCandidate(", "dict(")
-        processed = re.sub(r"(\w+)=", r'"\1":', processed)
-        processed = processed.replace("dict(", "{").replace(")", "}")
-        return ast.literal_eval(processed)
-    return []
-
-
-# %%
-# string to list of dictionaries
-data["llm_soc_candidates"] = data["llm_soc_candidates"].map(parse_string)
-
-# %%
-print(f"llm {data['codable'].value_counts()}")
-
-
-# %%
-def access_soc_code_from_candidate_list(row_values: list[dict]) -> list[str]:
-    """From list of potential SOC candidates, access SOC codes.
-
-    Args:
-        row_values (list[dict]): list of dictionaries with SOC candidates.
-
-    Return:
-        candidates (list[str]): list of 4-digit candidate codes.
-    """
-    if isinstance(row_values, list):
-        candidates = []
-        for row in row_values:
-            if len(row) < 1:
-                return None
-            candidates.append(row.get("soc_code"))
-    else:
-        return None
-    return candidates
-
-
-# %%
-def float_to_list_of_codes(row_values: float) -> str:
-    """Convert float to a string of codes (str).
-
-    Args:
-        row_values (float): SOC code as a float.
-
-    Return:
-        row_values (str): SOC code as a string.
-    """
-    if isinstance(row_values, float):
-        codes_list = [f"{row_values:.0f}"]
-        print(type(codes_list))
-        return codes_list
-    return [row_values]
-
-
-# %%
-data["label"] = data["label"].astype(str)
-
-# %%
-msk = data["llm_soc_code"].isna()  # take rows, where LLM didn't provide a code.
-
-# %%
-data.loc[~msk, "llm_soc_code"] = data.loc[~msk, "llm_soc_code"].apply(
-    float_to_list_of_codes
-)
-
-# %%
-data.loc[msk, "llm_soc_code"] = data.loc[msk, "llm_soc_candidates"].apply(
-    access_soc_code_from_candidate_list
-)
-
-
-# %%
-def check_agreement(df: pd.DataFrame, df_source: str):
-    """Checks agreement between ASHE and LLM assigned codes.
-
-    Args:
-        df (pd.DataFrame): dataframe containing columns 'label' and 'llm_soc_code' with codes.
-        df_source (str): String indicaitng the source of the dataframe (ASHE or soc index).
-    """
-    agr, in_cand = 0, 0
-    # check if 'label' is the same as 'llm_soc_code'.
-    # If LLM uncodable, check if 'label' in candidates.
-    for row in range(len(df)):
-        if len(df.iloc[row]["llm_soc_code"]) == 1:
-            agr += df.iloc[row]["label"] == df.iloc[row]["llm_soc_code"][0]
-            df.loc[row, "codable"] = True
-        elif len(df.iloc[row]["llm_soc_code"]) > 1:
-            in_cand += df.iloc[row]["label"] in df.iloc[row]["llm_soc_code"]
-
-    print(
-        f"Agreement full {df_source}: {agr} ({round(agr / len(df), 2) * 100}% of all rows)"
-    )
-    print(
-        f"Agreement (code in candidates) {df_source}: {in_cand} ({round(in_cand / len(df), 2) * 100}% of all rows)"  # pylint: disable=C0301
-    )
-    print(
-        f"Agreement (label the same or within candidates) {df_source}: {agr + in_cand} ({round((agr + in_cand) / len(df), 2) * 100}% of all rows)"  # pylint: disable=C0301
-    )
-
-
-# %%
-check_agreement(data, "ASHE and LLM")
-
-# %%
-len(data)
-
-
-# %%
-def check_code_count(df: pd.DataFrame, df_source: str):
-    """Check if the LLM assigned a sigle, multiple, or none codes when assessing SOC codes.
-
-    Args:
-        df (pd.DataFrame): dataframe containing LLM assessment of SOC codes.
-            Requires 'llm_soc_code' column.
-        df_source (str): String indicaitng the source of the dataframe (ASHE or soc index).
-    """
-    longer, shorter, one_code = 0, 0, 0
-    for code in df["llm_soc_code"]:
-        if isinstance(code, list):
-            if len(code) > 1:
-                longer += 1
-            if len(code) < 1:
-                shorter += 1
-            if len(code) == 1:
-                one_code += 1
-        else:
-            one_code += 1
-
-    print(
-        f"More than one code {df_source}: {longer} ({round(longer / len(df) * 100, 2)}%)"
-    )
-    print(
-        f"No codes assigned {df_source}: {shorter} ({round(shorter / len(df) * 100, 2)}%)"
-    )
-    print(
-        f"One code assigned {df_source}: {one_code} ({round(one_code / len(df) * 100, 2)}%)"
-    )
-
-
-# %%
-check_code_count(data, "ASHE")
-
-# %%
-full_data_codable = data[data["codable"]]
-
-# %%
-data_only_columns = data[["documents", "llm_soc_code"]]
-
-# %%
-data_one_code = data_only_columns[data_only_columns["llm_soc_code"].str.len() == 1]
-
-# %%
-e = data_one_code["llm_soc_code"].str[0]
-
-# %%
-numeric = pd.to_numeric(e, errors="coerce")
-
-# %%
-data_one_code["llm_soc_code"] = numeric
-
-# %%
-data_one_code = data_one_code.dropna(subset=["llm_soc_code"])
-
-# %%
-data_one_code["llm_soc_code"] = data_one_code["llm_soc_code"].astype(int)
-
-# %%
-data_one_code = data_one_code.rename(
-    columns={"corrected_spelling": "documents", "llm_soc_code": "label"}
-)
-
-# %%
-data_one_code = data_one_code.drop_duplicates(
-    subset=["documents", "label"], keep="last", ignore_index=True
-)
-
-
-# %%
-def load_soc_framework(filepath: str) -> pd.DataFrame:
-    """Load SOC structure.
-
-    Provides structure with all levels and names of the SOC 2020.
-
-    Args:
-        filepath (str): A path to the file containing SOC Structure.
-
-    Returns:
-        pd.DataFrame: A DataFrame containing group code, group title,
-        group description, typical entry routes and associated qualifications,
-        and list of tasks.
-    """
-    soc_df = pd.read_excel(
-        filepath,
-        sheet_name="SOC2020 framework",
-        usecols=[
-            "SOC2020 Unit Group",
-            "SOC2020 Group Title",
-        ],
-        dtype=str,
-    )
-    soc_df.columns = [
-        col.lower().replace(" ", "_").replace("__", "_").replace("\n", "")
-        for col in soc_df.columns
-    ]
-    soc_df = soc_df.rename(
-        columns={"soc2020_unit_group": "code", "soc2020_group_title": "title"}
-    )
-
-    for col in soc_df.columns:
-        soc_df[col] = soc_df[col].str.strip()
-
-    return soc_df
-
-
-# %%
-knowledge_bucket = dotenv.get_key(".env", "KNOWLEDGE_BUCKET")
-
-# %%
-s_list = load_soc_framework(
-    f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx"
-)
-s_list = s_list[s_list["code"].notna()]
-
-# %%
-codes_from_framework_str = list(s_list["code"].value_counts().keys())
-
-# %%
-codes_from_framework_int = []
-for k in codes_from_framework_str:
-    codes_from_framework_int.append(int(k))
-
-# %%
-phantom_codes = (
-    data_one_code[~data_one_code["label"].isin(codes_from_framework_int)]["label"]
-    .value_counts()
-    .keys()
-)
-
-# %%
-print("codes that don't appear in the SOC codes list\n", phantom_codes)
-
-# %%
-data_one_code_no_phantoms = data_one_code[
-    data_one_code["label"].isin(codes_from_framework_int)
-]
-
-# %%
-coded_all = len(data_one_code)
-
-# %%
-coded_no_phantom = len(data_one_code_no_phantoms)
-
-# %%
-diff = len(data_one_code) - len(data_one_code_no_phantoms)
-
-# %%
-drop = diff / coded_all * 100
-
-# %%
-print(
-    f"with phantoms: {coded_all}\ncoded no phantoms: {coded_no_phantom}\ndiff: {diff}\ndrop(%): {drop:.2f}"  # pylint: disable=C0301
-)
-
-# %%
-print(data_one_code_no_phantoms)
-
-# %%
-print(
-    "check if there is any duplicates\n",
-    data_one_code_no_phantoms[
-        data_one_code_no_phantoms.duplicated(subset=["documents"])
-    ],
-)
-
-# %%
-data_one_code_no_phantoms = data_one_code_no_phantoms.drop_duplicates(
-    subset=["documents"], keep="last", ignore_index=True
-)
-
-# %%
-""" data_one_code_no_phantoms contains codes assigned by the LLM. Some of the codes were not present in the SOC codes list, and have been removed.
-Those codes not neccessairly agree with codes initially assigned in ASHE dataset.
-"""
-
-# %%
-# data_one_code_no_phantoms.to_csv("soc_data/SOC_DIRECT_LOOKUP.csv")
-
-# %%
-# data_one_code_no_phantoms.to_csv(f"{knowledge_bucket}SOC_DIRECT_LOOKUP.csv")
-
-# %% [markdown]
-# # AGREEMENT
-
-# %%
-"""Select a subset of codes, where LLM and ASHE assign the same code for a given job title.
-"""
-
-# %%
-msk_codable = data["codable"]
-
-# %%
-data_codable = data[msk_codable]
-
-# %%
-len(data_codable[data_codable["llm_soc_code"].str.len() > 1])
-
-# %%
-len(data_codable[(data_codable["llm_soc_code"].str.len() == 1)])
-
-# %%
-len(
-    data_codable[data_codable["llm_soc_code"].str.len() < 1]
-)  # expect 0 - if is codable, there should be a code available
-
-# %%
-print(data_codable[data_codable["llm_soc_code"].str.len() == 1])
-
-# %%
-one_code_subset = data_codable[data_codable["llm_soc_code"].str.len() == 1]
-
-# %%
-codes_with_agreement = one_code_subset[
-    one_code_subset.apply(lambda r: str(r["label"]) in str(r["llm_soc_code"]), axis=1)
-].reset_index(drop=True)
-
-# %%
-soc_lookup = codes_with_agreement[["documents", "label"]]
-
-# %%
-# save this once all is finished
-
-# %% [markdown]
-# # One code from LLM - why disagreement?
-
-# %%
-full_data_one_code = data[data["llm_soc_code"].str.len() == 1]
-
-# %%
-one_code_disagreement = full_data_one_code[
-    full_data_one_code.apply(
-        lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1
-    )
-].reset_index(drop=True)
-
-# %%
-print(one_code_disagreement)
-
-# %%
-"""Look at the cases, where:
-- LLM claims is codable ('codable' == True)
-- ASHE does not agree with LLM ('label' != 'llm_soc_code')
-- ASHE is one of the candidates selected by LLM ('label' in 'llm_soc_candidates')
-"""
-
-
-# %%
-def get_candidates_list(row: pd.Series) -> list:
-    """Get a list of candidates determined by LLM.
-
-    Args:
-        row: pd.Series: row with LLM output
-
-    Returns:
-        list: lsit of candidates.
-    """
-    candidates = []
-    for i in row["llm_soc_candidates"]:
-        candidates.append(i["soc_code"])
-    return candidates
-
-
-# %%
-ashe_llm_disagreement_multi_candidate = one_code_disagreement[
-    one_code_disagreement["llm_soc_candidates"].str.len() > 1
-].reset_index(drop=True)
-
-# %%
-ashe_llm_disagreement_multi_candidate.loc[:, "candidate_list"] = (
-    ashe_llm_disagreement_multi_candidate.apply(get_candidates_list, axis=1)
-)
-
-# %%
-ashe_in_canidates = ashe_llm_disagreement_multi_candidate[
-    ashe_llm_disagreement_multi_candidate.apply(
-        lambda r: str(r["label"]) in str(r["candidate_list"]), axis=1
-    )
-]
-
-# %%
-print(
-    f"""There is {len(ashe_in_canidates)} rows, where code determined by ASHE appears in the cadnidates from LLM, when LLM assessed the job title is codable."""
-)
-
-# %% [markdown]
-# # How many of the rows that are in SOC INDEX have agreement/don't have agreement with ASHE
-
-# %%
-# Access SOC_INDEX data
-soc_coding_index_file = (
-    f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx"
-)
-
-
-# %%
-def load_soc_index(filepath: str) -> pd.DataFrame:
-    """Load SOC index.
-    Provides a list of over 32,000 titles associated with employment.
-
-    Args:
-        filepath (str): A path to the file containing SOC Index.
-
-    Returns:
-        pd.DataFrame: A DataFrame with transformed job titles.
-    """
-    soc_index_df = pd.read_excel(
-        filepath,
-        sheet_name="SOC2020 coding index",
-        usecols=["SOC_2020", "INDEXOCC-natural_word_order", "ADD", "IND"],
-        dtype=str,
-    )
-
-    soc_index_df.columns = [col.lower() for col in soc_index_df.columns]
-
-    soc_index_df = soc_index_df.rename(
-        columns={"indexocc-natural_word_order": "indexocc", "soc_2020": "code"}
-    )
-
-    soc_index_df = soc_index_df[soc_index_df["code"] != "}}}}"]
-    soc_index_df = soc_index_df.dropna(subset=["code", "indexocc"])
-    soc_index_df["title"] = soc_index_df.apply(combine_job_title, axis=1)
-    soc_index_df = soc_index_df[["code", "title"]]
-    soc_index_df["title"] = soc_index_df["title"].str.capitalize()
-
-    return soc_index_df
-
-
-# %%
-soc_list = load_soc_index(soc_coding_index_file)
-
-# %%
-soc_list["title"] = soc_list["title"].str.upper()
-
-# %%
-titles_list = soc_list["title"]
-titles_list = titles_list.to_list()
-
-# %%
-# get subset of the ASHE data that comes from soc_index
-in_list = data[data["documents"].isin(titles_list)].reset_index(drop=True)
-
-# %%
-in_list_codable = in_list[in_list["codable"]]
-
-# %%
-in_list_codable_disagreement = in_list_codable[
-    in_list_codable.apply(
-        lambda r: str(r["label"]) not in str(r["llm_soc_candidates"]), axis=1
-    )
-].reset_index(drop=True)
-in_list_codable_agreement = in_list_codable[
-    in_list_codable.apply(
-        lambda r: str(r["label"]) in str(r["llm_soc_candidates"]), axis=1
-    )
-].reset_index(drop=True)
-
-# %%
-in_list_codable_disagreement_one_code = in_list_codable_disagreement[
-    in_list_codable_disagreement["llm_soc_code"].str.len() == 1
-]
-in_list_codable_agreement_one_code = in_list_codable_agreement[
-    in_list_codable_agreement["llm_soc_code"].str.len() == 1
-]
-
-# %%
-soc_lookup = (
-    pd.concat([soc_lookup, in_list_codable_agreement_one_code[["documents", "label"]]])
-    .drop_duplicates(subset=["documents", "label"])
-    .reset_index(drop=True)
-)
-
-# %% [markdown]
-# # LLM candidates - high likelihood (0.9/0.7)
-
-# %%
-data_multiple_candidates = data[data["llm_soc_candidates"].str.len() > 1].reset_index(
-    drop=True
-)
-
-
-# %%
-def get_high_candidate(row: pd.Series) -> str:
-    """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM),
-    where only one candidate got that score.
-
-    Args:
-        row: pd.Series: row with LLM output
-
-    Returns:
-        str: most likely candidate.
-    """
-    high_likelihood = []
-    for i in row["llm_soc_candidates"]:
-        if i["likelihood"] >= 0.9:  # noqa: PLR2004
-            high_likelihood.append(i)
-    if len(high_likelihood) != 1:
-        return None
-    return high_likelihood[0]["soc_code"]
-
-
-# %%
-def get_high_candidate_with_low_other(row: pd.Series) -> str:
-    """Get a most likely candidate with likelihood greater than 0.9 (assessed by the LLM),
-    where only one candidate got that score, and no other candidates got likelihood score above 0.7.
-
-    Args:
-        row: pd.Series: row with LLM output
-
-    Returns:
-        str: most likely candidate.
-    """
-    high_likelihood, lower_likelihood = [], []
-
-    for i in row["llm_soc_candidates"]:
-        if i["likelihood"] >= 0.9:  # noqa: PLR2004
-            high_likelihood.append(i)
-        elif i["likelihood"] >= 0.7:  # noqa: PLR2004
-            lower_likelihood.append(i)
-
-    if len(high_likelihood) != 1 or len(lower_likelihood) > 0:
-        return None
-    return high_likelihood[0]["soc_code"]
-
-
-# %%
-data_multiple_candidates.loc[:, "most_likely_candidate"] = (
-    data_multiple_candidates.apply(get_high_candidate, axis=1)
-)
-
-# %%
-print(len(data_multiple_candidates))
-
-# %%
-data_high_likelihood = data_multiple_candidates[
-    data_multiple_candidates["most_likely_candidate"].notna()
-]
-
-# %%
-data_high_likelihood_agreement = data_high_likelihood[
-    data_high_likelihood.apply(
-        lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1
-    )
-].reset_index(drop=True)
-
-# %%
-print(data_high_likelihood_agreement.iloc[0])
-
-# %%
-misspelled = 0
-for k in data_high_likelihood_agreement["reasoning"]:
-    # print(k)
-    if "misspelling" in k or "misspelled" in k:
-        misspelled += 1
-
-# %%
-print(misspelled)
-
-# %%
-print(
-    f"""We looked at the rows, where the LLM decided thre is more than one possible SOC code candidate {len(data_multiple_candidates)} codes ({round((len(data_multiple_candidates) / len(data)) * 100, 2)}% of all codes).
-To that subset of data, we added a new column 'most_likely_candidate'.
-It was populated with codes, that were assessed to have a high (0.9) likelihood.
-If the LLM assigned more than one code with high likelihood, those were disregarded,
-as there is no way to determine which code is more likely, according to the LLM, meaning it is not unambiguous.
-
-Only one candidate with 0.9 likelihood was assigned for {len(data_high_likelihood)} rows.
-
-Next, we compared the agreement between the label assigned in the original data with the "most_likely_candidate",
-which resulted in {len(data_high_likelihood_agreement)} cases.
-"""
-)
-
-# %%
-data_high = data[data["llm_soc_candidates"].str.len() > 1].reset_index(drop=True)
-
-# %%
-data_high.loc[:, "most_likely_candidate"] = data_high.apply(
-    get_high_candidate_with_low_other, axis=1
-)
-
-# %%
-data_high_notna = data_high[data_high["most_likely_candidate"].notna()]
-
-# %%
-data_high_notna_agreement = data_high_notna[
-    data_high_notna.apply(
-        lambda r: str(r["label"]) == str(r["most_likely_candidate"]), axis=1
-    )
-].reset_index(drop=True)
-
-# %%
-soc_lookup = (
-    pd.concat([soc_lookup, data_high_notna_agreement[["documents", "label"]]])
-    .drop_duplicates(subset=["documents", "label"])
-    .reset_index(drop=True)
-)
-
-# %%
-# soc_lookup.to_csv(f"{knowledge_bucket}wip_data/SOC_DIRECT_LOOKUP.csv")
-
-# %%
-# data_one_code_no_phantoms
-len(soc_lookup)
-
-# %%
-soc_copy = (
-    pd.concat([soc_lookup, data_one_code_no_phantoms[["documents", "label"]]])
-    .drop_duplicates(subset=["documents"])
-    .reset_index(drop=True)
-)
-
-# %%
-len(soc_copy)

From be20c2ed0644c980737afbeb522d041860fd35a5 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Tue, 16 Jun 2026 08:31:37 +0000
Subject: [PATCH 23/24] add level of education

---
 src/occupational_classification_utils/llm/prompt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/occupational_classification_utils/llm/prompt.py b/src/occupational_classification_utils/llm/prompt.py
index a289e0c..3955d04 100644
--- a/src/occupational_classification_utils/llm/prompt.py
+++ b/src/occupational_classification_utils/llm/prompt.py
@@ -104,6 +104,7 @@
 - Company's main activity: {industry_descr}
 - Job Title: {job_title}
 - Job Description: {job_description}
+- Level of Education: {level_of_education}
 
 ===Relevant subset of UK SOC 2020===
 {soc_index}

From e8c5773b4b292ae38635c87acb16e939a1107969 Mon Sep 17 00:00:00 2001
From: peter-spencer-ons <peter.spencer@ext.ons.gov.uk>
Date: Mon, 22 Jun 2026 18:55:31 +0000
Subject: [PATCH 24/24] include level_of_education in two prompt pipeline

---
 src/occupational_classification_utils/llm/llm.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/occupational_classification_utils/llm/llm.py b/src/occupational_classification_utils/llm/llm.py
index c4390e1..aa7cc83 100644
--- a/src/occupational_classification_utils/llm/llm.py
+++ b/src/occupational_classification_utils/llm/llm.py
@@ -289,6 +289,11 @@ async def unambiguous_soc_code(  # noqa: PLR0913
             if (job_description is None or job_description in {"", " "})
             else job_description
         )
+        level_of_education = (
+            "Unknown"
+            if (level_of_education is None or level_of_education in {"", " "})
+            else level_of_education
+        )
 
         call_dict = {
             "industry_descr": industry_descr,