Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0ffe782
Add prompt and validation model for initial SOC classification
peter-spencer-ons May 11, 2026
1db4e81
add llm method for initial SOC classification
peter-spencer-ons May 11, 2026
b54238d
add llm method, prompt and validation for stage 3
peter-spencer-ons May 12, 2026
23a5087
resolve merging problems
peter-spencer-ons May 29, 2026
0a5b907
reflect types as in ruff
peter-spencer-ons May 29, 2026
7d76e93
add tests to meet 80% coverage
peter-spencer-ons May 29, 2026
d50951a
merge main
peter-spencer-ons Jun 2, 2026
40f0926
reorder llm.py to reflect changes made in main
peter-spencer-ons Jun 2, 2026
b419c6c
Use more general validation method
peter-spencer-ons Jun 2, 2026
a9b7c45
remove duplications in prompt.py
peter-spencer-ons Jun 2, 2026
7f77306
uncomment code in llm.py
peter-spencer-ons Jun 2, 2026
0fbf380
correct validation method
peter-spencer-ons Jun 2, 2026
9117aa1
hash level_of_education
peter-spencer-ons Jun 3, 2026
0788eb9
remove level of education from test and prompt
peter-spencer-ons Jun 3, 2026
0179a00
Revert "remove level of education from test and prompt"
peter-spencer-ons Jun 3, 2026
0098876
Revert "hash level_of_education"
peter-spencer-ons Jun 3, 2026
cb4be4e
level of education typehint: optional str
peter-spencer-ons Jun 3, 2026
5d24c2e
add level of education field in the followup question
peter-spencer-ons Jun 3, 2026
91ab1f1
level of education for stages 2 and 3 in the pipeline
peter-spencer-ons Jun 4, 2026
4fb7766
allow zero alternative candidates
peter-spencer-ons Jun 4, 2026
88337c0
add level of education to the prompt
peter-spencer-ons Jun 15, 2026
07c23f1
create soc lookup
peter-spencer-ons Jun 15, 2026
cf5cb1f
Revert "add level of education to the prompt"
peter-spencer-ons Jun 16, 2026
29c5712
Revert "create soc lookup"
peter-spencer-ons Jun 16, 2026
be20c2e
add level of education
peter-spencer-ons Jun 16, 2026
e8c5773
include level_of_education in two prompt pipeline
peter-spencer-ons Jun 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 61 additions & 9 deletions src/occupational_classification_utils/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async def get_soc_code(
self,
job_title: str,
job_description: str,
level_of_education: str,
level_of_education: str | None,
manage_others: bool,
industry_descr: str,
) -> SocResponse:
Expand Down Expand Up @@ -269,6 +269,7 @@ async def unambiguous_soc_code( # noqa: PLR0913
semantic_search_results: list[dict],
job_title: str | None = None,
job_description: str | None = None,
level_of_education: str | None = None,
candidates_limit: int = config["llm"]["candidates_limit"],
code_digits: int = config["llm"]["code_digits"],
correlation_id: str | None = None,
Expand All @@ -288,11 +289,17 @@ async def unambiguous_soc_code( # noqa: PLR0913
if (job_description is None or job_description in {"", " "})
else job_description
)
level_of_education = (
"Unknown"
if (level_of_education is None or level_of_education in {"", " "})
else level_of_education
)

call_dict = {
"industry_descr": industry_descr,
"job_title": job_title,
"job_description": job_description,
"level_of_education": level_of_education,
"soc_candidates": soc_candidates,
}

Expand All @@ -305,6 +312,7 @@ async def unambiguous_soc_code( # noqa: PLR0913
"LLM request sent - unambiguous_soc_code",
job_title=truncate_identifier(job_title),
job_description=truncate_identifier(job_description),
level_of_education=truncate_identifier(str(level_of_education)),
industry_descr=truncate_identifier(industry_descr),
correlation_id=correlation_id or "",
)
Expand Down Expand Up @@ -391,37 +399,69 @@ async def unambiguous_soc_code( # noqa: PLR0913

return validated_answer, call_dict

async def formulate_open_question(
async def formulate_open_question( # noqa: PLR0913
self,
industry_descr: str,
job_title: str | None = None,
job_description: str | None = None,
level_of_education: str | None = None,
llm_output: RagCandidate | None = None,
correlation_id: str | None = None,
) -> tuple[OpenFollowUp, dict[str, Any]]:
"""Formulate an open-ended follow-up (mirrors SIC formulate_open_question)."""
) -> tuple[OpenFollowUp, Any]:
"""Formulates an open-ended question using respondent data and survey design guidelines.

Args:
industry_descr (str): The description of the industry.
job_title (str, optional): The job title. Defaults to None.
job_description (str, optional): The job description. Defaults to None.
level_of_education (str, optional): The level of education. Defaults to None.
llm_output (RagCandidate, optional): The response from the LLM model.
correlation_id (str, optional): Optional correlation ID for request tracking.

def prep_call_dict(industry_descr, job_title, job_description, llm_output):
Returns:
OpenFollowUp: The generated response to the query.

Raises:
ValueError: If there is an error during the parsing of the response.
ValueError: If the default embedding handler is required but
not loaded correctly.

"""

def prep_call_dict(
industry_descr, job_title, job_description, level_of_education, llm_output
):
# Helper function to prepare the call dictionary
is_job_title_present = job_title is None or job_title in {"", " "}
job_title = "Unknown" if is_job_title_present else job_title

is_job_description_present = job_description is None or job_description in {
"",
" ",
}
job_description = (
"Unknown" if is_job_description_present else job_description
)
return {
level_of_education = (
"Unknown"
if (level_of_education is None or level_of_education in {"", " "})
else level_of_education
)

call_dict = {
"industry_descr": industry_descr,
"job_title": job_title,
"job_description": job_description,
"level_of_education": level_of_education,
"llm_output": str(llm_output),
}
return call_dict

call_dict = prep_call_dict(
industry_descr=industry_descr,
job_title=job_title,
job_description=job_description,
level_of_education=level_of_education,
llm_output=llm_output,
)

Expand All @@ -430,10 +470,13 @@ def prep_call_dict(industry_descr, job_title, job_description, llm_output):
logger.debug(final_prompt)

chain = self.soc_prompt_openfollowup | self.llm

# Log LLM request sent
logger.info(
"LLM request sent - formulate_open_question",
job_title=truncate_identifier(job_title),
job_description=truncate_identifier(job_description),
level_of_education=truncate_identifier(str(level_of_education)),
industry_descr=truncate_identifier(industry_descr),
correlation_id=correlation_id or "",
)
Expand All @@ -459,9 +502,11 @@ def prep_call_dict(industry_descr, job_title, job_description, llm_output):

llm_duration_ms = int((time.perf_counter() - llm_start) * 1000)

# Parse the output to the desired format
parser = PydanticOutputParser(pydantic_object=OpenFollowUp)
try:
validated_answer = parser.parse(str(response.content))
# Log LLM response received after successful parse
has_followup = bool(getattr(validated_answer, "followup", None))
logger.info(
"LLM response received for open question prompt",
Expand All @@ -487,8 +532,8 @@ def prep_call_dict(industry_descr, job_title, job_description, llm_output):
correlation_id=correlation_id or "",
)
try:
fix_chain = FIX_PARSING_PROMPT | self.llm
response = await fix_chain.ainvoke(
chain = FIX_PARSING_PROMPT | self.llm
response = await chain.ainvoke(
{
"llm_output": str(response.content),
"format_instructions": parser.get_format_instructions(),
Expand All @@ -497,6 +542,7 @@ def prep_call_dict(industry_descr, job_title, job_description, llm_output):
)
validated_answer = parser.parse(str(response.content))
logger.debug("Successfully parsed reformatted response.")

except (ValueError, AttributeError) as parse_error2:
logger.error(
f"Failed to parse response again: {parse_error2}",
Expand Down Expand Up @@ -524,6 +570,7 @@ async def sa_rag_soc_code( # noqa: PLR0913
industry_descr: str,
job_title: str | None = None,
job_description: str | None = None,
level_of_education: str | None = None,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have added level_of_education to the sa_rag_soc_code method, but not to its prompt.

code_digits: int = config["llm"]["code_digits"],
candidates_limit: int = config["llm"]["candidates_limit"],
short_list: list[dict[Any, Any]] | None = None,
Expand All @@ -537,6 +584,7 @@ async def sa_rag_soc_code( # noqa: PLR0913
industry_descr (str): The description of the industry.
job_title (str, optional): The job title. Defaults to None.
job_description (str, optional): The job description. Defaults to None.
level_of_education (str, optional): The level of education required for the job. Defaults to None.
code_digits (int, optional): The number of digits in the generated
SOC code. Defaults to 4.
candidates_limit (int, optional): The maximum number of SOC code candidates
Expand All @@ -554,7 +602,9 @@ async def sa_rag_soc_code( # noqa: PLR0913

"""

def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
def prep_call_dict(
industry_descr, job_title, job_description, level_of_education, soc_codes
):
# Helper function to prepare the call dictionary
is_job_title_present = job_title is None or job_title in {"", " "}
job_title = "Unknown" if is_job_title_present else job_title
Expand All @@ -571,6 +621,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
"industry_descr": industry_descr,
"job_title": job_title,
"job_description": job_description,
"level_of_education": level_of_education,
"soc_index": soc_codes,
}
return call_dict
Expand All @@ -588,6 +639,7 @@ def prep_call_dict(industry_descr, job_title, job_description, soc_codes):
industry_descr=industry_descr,
job_title=job_title,
job_description=job_description,
level_of_education=level_of_education,
soc_codes=soc_codes,
)

Expand Down
4 changes: 4 additions & 0 deletions src/occupational_classification_utils/llm/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}
- Level of Education: {level_of_education}

===Relevant subset of UK SOC 2020===
{soc_index}
Expand Down Expand Up @@ -155,6 +156,7 @@
},
)


FIX_PARSING_PROMPT = PromptTemplate.from_template(
"""You are a meticulous assistant tasked with ensuring that
the output from a language model adheres strictly to the required JSON format.
Expand Down Expand Up @@ -196,6 +198,7 @@
- Company's main activity: {industry_descr}
- Job Title: {job_title}
- Job Description: {job_description}
- Level of Education: {level_of_education}

===Shortlist===
{soc_candidates}
Expand Down Expand Up @@ -228,6 +231,7 @@
- Company's main activity: {industry_descr}
- Job title: {job_title}
- Job description: {job_description}
- Level of Education: {level_of_education}
- Shortlist from previous model: {llm_output}
- Note: These are candidate occupational categories; do not mention codes or "SOC"
to the respondent.
Expand Down
82 changes: 55 additions & 27 deletions src/occupational_classification_utils/models/response_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,61 +277,89 @@ class SurveyAssistSocResponse(BaseModel):


class UnambiguousResponse(BaseModel):
"""Represents a response model for classification code assignment (two-step SOC).
"""Represents a response model for classification code assignment.

Same generic field names as SIC ``UnambiguousResponse`` for parity across schemes.
Attributes:
codable (bool): True only if enough information is provided to assign
an unambiguous single classification code, False otherwise.
class_code (Optional[str]): Full classification code (to the required number of digits)
assigned based on provided respondent's data. Must be present if codable=True,
must be None if codable=False.
class_descriptive (Optional[str]): Descriptive label of the classification category.
Must be present if codable=True, must be None if codable=False.
alt_candidates (list[RagCandidate]): Short list of possible classification codes with their
descriptive labels and estimated likelihoods.
reasoning (str): Step by step reasoning behind the classification selected.
"""

codable: bool = Field(
description=(
"True only if enough information is provided to decide an unambiguous "
"classification code, False otherwise."
)
description="True only if enough information is provided to decide an unambiguous "
"classification code, False otherwise."
)

class_code: str | None = Field(
default=None,
description=(
"Full classification code assigned from respondent data. "
"Present if codable=True, None if codable=False."
),
description="Full classification code (to the required number of digits) "
"assigned based on provided respondent's data. Must be present if codable=True, "
"must be None if codable=False.",
)

class_descriptive: str | None = Field(
default=None,
description=(
"Descriptive label for class_code. Present if codable=True, "
"None if codable=False."
),
description="Descriptive label of the classification category. "
"Must be present if codable=True, must be None if codable=False.",
)

alt_candidates: list[RagCandidate] = Field(
default_factory=list,
description="Short list of possible classification codes with likelihoods.",
min_length=1,
max_length=10,
description="Short list of possible classification codes with their "
"descriptive labels and estimated likelihoods.",
max_length=10, # Limit to less than 10 candidates
)

reasoning: str = Field(
description="Step by step reasoning behind the classification selected.",
min_length=50,
min_length=50, # Ensure detailed reasoning is provided
)

@field_validator("alt_candidates")
@classmethod
def validate_alt_candidates(cls, v: list[RagCandidate]) -> list[RagCandidate]:
"""Validate alternative candidate count."""
if not 1 <= len(v) <= MAX_ALT_CANDIDATES:
raise ValueError("alt_candidates must contain between 1 and 10 items.")
def validate_alt_candidates(cls, v):
Comment thread
ivyONS marked this conversation as resolved.
"""Validates the number of alternative candidates.

Ensures that the number of candidates is less than or equal to the maximum allowed.

Args:
v (list): The list of alternative candidates.

Returns:
list: The validated list of candidates.

Raises:
ValueError: If the number of candidates is not within the allowed range.
"""
if not len(v) <= MAX_ALT_CANDIDATES:
raise ValueError("alt_candidates must contain no more than 10 items.")
return v


class OpenFollowUp(BaseModel):
"""Open-ended follow-up question when SOC cannot be assigned unambiguously."""
"""Represents a response model for open ended follow-up question.

Attributes:
followup (str): Question to ask user in order to collect
additional information to enable reliable classification assignment.
reasoning (str): Reasoning explaining how follow-up question will help
assign classification code.
"""

followup: str | None = Field(
description=(
"Question to collect additional information for reliable SOC assignment."
),
description="""Question to ask user in order to collect additional information
to enable reliable classification assignment.""",
default="",
)
reasoning: str = Field(
description="Reasoning explaining how the follow-up question helps classification.",
description="""Reasoning explaining how follow-up question will help
assign classification code.""",
default="",
)
Loading