-
Notifications
You must be signed in to change notification settings - Fork 1
SA617 - Cleaning ASHE strings and preparing SOC knowledgebase and SOC DIRECT LOOKUP #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
abda90f
930bb75
1866384
f8b3b1f
f97a22c
d117afa
450cbca
9b23e16
0501860
b49c43d
1d831b6
4440585
438dbfc
5e4738d
ecd45be
633187a
c4aa376
73d9ade
4f7fe85
0e36e72
100edb4
efc0978
265dad2
877c771
36b0975
5cbf18d
cc7f148
a5d89a6
d2c90ed
ad0d103
cdf362c
cc7ddbb
11725d7
f47f50f
67d01f6
b239c58
6b33a45
95703a8
87fa56e
c2cceb7
964506a
25186e0
d4de101
7711abb
d467658
fc71991
5a34c96
d2d0596
f9e278f
a7b6af9
c5b941d
46ee733
0d8d1cf
421b7cd
24e4ce9
1728b2e
318d9cd
11b6717
22caba1
0cf9dcf
4e37980
b8bef9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| KNOWLEDGE_BUCKET=<fill in> | ||
| SA_DEV=<fill in> | ||
| SA_SANDBOX=<fill in> |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,284 @@ | ||
| # pylint: disable=C0103, C0301 | ||
| """Exploratory work, no longer used for the data cleaning. | ||
| This is not a notebook. Run as a script. | ||
|
|
||
| Uses LLMs to correct the spelling of provided text strings. | ||
| Recognises ABBREVIATIONS commonly used in SOC. | ||
|
|
||
| To execute, run: | ||
| `python notebooks/ashe_clean_2026_04.py ` | ||
|
|
||
| Disabling line-too-long (C0301): the file contains long commentary and discussion lines. | ||
| """ | ||
|
|
||
| import asyncio | ||
| import json | ||
| import math | ||
| import os | ||
|
|
||
| import dotenv | ||
| import pandas as pd | ||
| from occupational_classification.data_access.soc_data_access import ( | ||
| _combine_soc_index_job_title as combine_job_title, | ||
| ) | ||
|
|
||
| from occupational_classification_utils.llm.llm import ClassificationLLM | ||
|
|
||
### Constants ###
# Read from the local ".env" file. The value is concatenated directly with
# file names below, so it is expected to end with a path separator.
knowledge_bucket = dotenv.get_key(".env", "KNOWLEDGE_BUCKET")

# Local folder and file-name parts used for the output CSV and the JSON
# checkpoint: "<output_folder>/<file_prefix><file_suffix>.{csv,json}".
output_folder = "notebooks/soc_data"
file_prefix = "ashe_correct_spelling"
file_suffix = "_2026_05_19_sample4"

# Number of job titles sent to the LLM concurrently and saved per checkpoint.
BATCH_SIZE = 10

soc_coding_index_file = (
    f"{knowledge_bucket}soc2020volume2thecodingindexexcel03122025.xlsx"
)

c_llm = ClassificationLLM("gemini-2.5-flash", verbose=False)

### Access data ###
# Try the knowledge bucket first; if the file is not found, fall back to the
# local output CSV of this script. "label" is read as str in both cases.
try:
    data = pd.read_csv(
        f"{knowledge_bucket}ASHE_classifai_soc_kb.csv", dtype={"label": str}
    )
    print("Database loaded from storage.")

except FileNotFoundError:
    print("File not found in the specified KNOWLEDGE_BUCKET.")
    data = pd.read_csv(
        f"{output_folder}/{file_prefix}{file_suffix}.csv", dtype={"label": str}
    )
    print("Database loaded from local.")

# Resume support: the value stored under "completed_batches" is used as the
# index of the first batch to process in this run; without a checkpoint file
# processing starts from batch 0.
# NOTE(review): a reviewer suspected a batch may be duplicated after an
# interruption; the author reported seeing no duplication.
try:
    with open(
        f"{output_folder}/{file_prefix}{file_suffix}.json", encoding="utf-8"
    ) as file:
        recent_batch_id = json.load(file)["completed_batches"]

except FileNotFoundError:
    recent_batch_id = 0

print(
    f"STARTING FROM {recent_batch_id} batch (row {recent_batch_id * BATCH_SIZE} out of {len(data)})."
)
|
|
||
|
|
||
| ### Read the data ### | ||
def load_soc_index(filepath: str) -> pd.DataFrame:
    """Read the SOC 2020 coding index and return code/title pairs.

    The "SOC2020 coding index" sheet is read with every column as a string.
    Rows whose code is the placeholder "}}}}" or whose code / job title is
    missing are dropped. The title is built per row by
    ``combine_job_title`` and then capitalised.

    Args:
        filepath (str): Path to the workbook containing the SOC index.

    Returns:
        pd.DataFrame: Two string columns, "code" and "title".
    """
    # NOTE(review): a reviewer flagged that this loader is defined in about
    # three different places; consider sharing a single implementation.
    sheet = pd.read_excel(
        filepath,
        sheet_name="SOC2020 coding index",
        usecols=["SOC_2020", "INDEXOCC-natural_word_order", "ADD", "IND"],
        dtype=str,
    )

    # Lower-case every header first, then shorten the two long names.
    sheet = sheet.rename(columns=str.lower).rename(
        columns={"indexocc-natural_word_order": "indexocc", "soc_2020": "code"}
    )

    has_real_code = sheet["code"].ne("}}}}")
    usable = sheet[has_real_code].dropna(subset=["code", "indexocc"])

    with_titles = usable.assign(title=usable.apply(combine_job_title, axis=1))
    code_title = with_titles[["code", "title"]]

    return code_title.assign(title=code_title["title"].str.capitalize())
|
|
||
|
|
||
# SOC coding index (columns "code" and "title") loaded from the workbook path
# built from the knowledge bucket.
soc_list = load_soc_index(soc_coding_index_file)
|
|
||
|
|
||
def load_soc_abbreviations(filepath: str) -> pd.DataFrame:
    """Read the abbreviations recognised in SOC titles.

    Args:
        filepath (str): Path to the workbook containing the "Abbreviations"
            sheet.

    Returns:
        pd.DataFrame: A table with the string columns "Abbreviation" and
            "Meaning" (one row per abbreviation).
    """
    read_options = {
        "sheet_name": "Abbreviations",
        "usecols": ["Abbreviation", "Meaning"],
        "dtype": str,
        # Column names are taken from row 5 (0-indexed) of the sheet.
        "header": 5,
    }
    return pd.read_excel(filepath, **read_options)
|
|
||
|
|
||
# Abbreviation table and its {abbreviation: meaning} lookup passed to the LLM.
soc_abb = load_soc_abbreviations(soc_coding_index_file)
soc_abb_dict = soc_abb.set_index("Abbreviation")["Meaning"].to_dict()

### Remove duplicates ###
# Strip surrounding whitespace first so that otherwise identical titles
# collapse; the last occurrence of each title is kept.
# NOTE(review): reviewers asked whether duplicates are dropped only before the
# LLM spellcheck, and whether to deduplicate on the text/code pair instead.
data["documents"] = data["documents"].str.strip()
data = data.drop_duplicates(subset="documents", keep="last")

### Add column for correct spelling ###
if "corrected_spelling" not in data.columns:
    data["corrected_spelling"] = None
# NOTE(review): indentation was lost in the reviewed diff; the cast is assumed
# to apply whether or not the column already existed - confirm.
data["corrected_spelling"] = data["corrected_spelling"].astype(str)

### Data manipulation ###
# Upper case, as in the original dataset.
soc_list["title"] = soc_list["title"].str.upper()

### Remove rows that come directly from SOC ###
titles_list = soc_list["title"].to_list()

# True where the ASHE title is an exact match of a SOC index title.
_from_soc_index = data["documents"].isin(titles_list)

# Titles that did not appear in soc_list.
not_in_list = data[~_from_soc_index].reset_index(drop=True)
# Titles that came from soc_list.
in_list = data[_from_soc_index].reset_index(drop=True)

# Save the subset of titles that are repeated from the SOC index.
in_list.to_csv(f"{output_folder}/ashe_in_soc_index{file_suffix}.csv", index=False)
print("ASHE IN SOC SAVED.")
|
|
||
|
|
||
def batching(
    job_titles_column: pd.Series, batch_id: int, batch_size: "int | None" = None
):
    """Return the ``batch_id``-th consecutive batch of job titles.

    Batches are taken by position, so the index labels of the series do not
    matter. The last batch may be shorter than ``batch_size``, and a batch id
    past the end of the data yields an empty series.

    Args:
        job_titles_column (pd.Series): A column with job titles.
        batch_id (int): Zero-based number of the batch.
        batch_size (int | None): Number of rows per batch. Defaults to the
            module-level ``BATCH_SIZE`` when omitted.

    Returns:
        pd.Series: A copy of the rows belonging to the requested batch, with
            their original index labels.

    Raises:
        ValueError: If the batch size is not positive.
    """
    # NOTE(review): a reviewer's "why async?" comment is attached near this
    # docstring in the PR view; this function itself is synchronous.
    size = BATCH_SIZE if batch_size is None else batch_size
    if size <= 0:
        raise ValueError(f"batch size must be positive, got {size}")

    start_id = batch_id * size
    # .iloc makes the positional slicing explicit.
    return job_titles_column.iloc[start_id : start_id + size].copy()
|
|
||
|
|
||
async def spelling(jt: str, abb_dict: dict):
    """Ask the LLM for a spelling-corrected version of one job title.

    Args:
        jt (str): Job title to be corrected.
        abb_dict (dict): Abbreviations and their meaning,
            {abbreviation: meaning}, forwarded to the LLM call.

    Returns:
        str: The ``job_title_spelling`` field of the LLM response, in upper
            case.
    """
    llm_result = await c_llm.clean_spelling(
        misspelled_string=jt,
        abbreviation_dictionary=abb_dict,
    )
    cleaned_title = llm_result.job_title_spelling
    return cleaned_title.upper()
|
|
||
|
|
||
async def split_in_batches(df: pd.DataFrame):
    """Correct job-title spelling batch by batch, checkpointing after each one.

    For every batch from ``recent_batch_id`` up to the last one:

    1. the titles in ``df["documents"]`` are sent to the LLM concurrently;
    2. the answers are written to ``df["corrected_spelling"]`` (the original
       title is kept when the answer is unusable);
    3. the batch rows are appended to the output CSV;
    4. the JSON checkpoint is rewritten with the number of completed batches.

    After the final batch the output CSV is read back, de-duplicated on
    ("corrected_spelling", "label") and rewritten.

    The checkpoint stores ``current_batch_id + 1``, i.e. the index of the next
    batch to process, so a resumed run does not redo (and re-append) the batch
    that had already been saved. A checkpoint written before this change holds
    the last completed id instead; resuming from it repeats one batch, which
    the final de-duplication removes.

    Args:
        df (pd.DataFrame): Frame with "documents", "label" and
            "corrected_spelling" columns. It is modified in place.
    """
    output_csv = f"{output_folder}/{file_prefix}{file_suffix}.csv"
    checkpoint_json = f"{output_folder}/{file_prefix}{file_suffix}.json"
    saved_columns = ["documents", "label", "corrected_spelling"]

    final_batch = math.ceil(len(df) / BATCH_SIZE)  # total number of batches

    for current_batch_id in range(recent_batch_id, final_batch):
        print(f"batch {current_batch_id}")

        current_batch = batching(df["documents"], current_batch_id)

        responses = await asyncio.gather(
            *(spelling(jt=jt, abb_dict=soc_abb_dict) for jt in current_batch)
        )

        # Address rows through the batch's own index labels so the rows that
        # are written are exactly the rows that are saved below.
        for row_label, original_title, llm_response in zip(
            current_batch.index, current_batch, responses
        ):
            # Keep the original title when the response is not a string or
            # contains "JOB TITLE" (presumably an echoed prompt placeholder -
            # confirm against the prompt).
            if not isinstance(llm_response, str) or "JOB TITLE" in llm_response:
                llm_response = original_title
            df.loc[row_label, "corrected_spelling"] = llm_response

        df.loc[current_batch.index, saved_columns].to_csv(
            output_csv,
            mode="a",
            header=False,
            index=False,
        )

        with open(checkpoint_json, "w", encoding="utf-8") as json_file:
            json.dump(
                {
                    # Number of finished batches == index of the next batch.
                    "completed_batches": current_batch_id + 1,
                },
                json_file,
            )

        if current_batch_id + 1 == final_batch:
            final_df = pd.read_csv(output_csv, dtype={"label": str})
            final_df = final_df.drop_duplicates(
                subset=["corrected_spelling", "label"], keep="last", ignore_index=True
            )  # remove duplicates in the corrected spelling

            final_df.to_csv(output_csv, index=False)

            print("FILE SAVED TO LOCAL")

            # final_df.to_csv(f"{knowledge_bucket}wip_data/{file_prefix}{file_suffix}.csv", index=False)
            # print("SAVED TO BUCKET")
|
|
||
|
|
||
# On the first run create the output CSV holding only the header row;
# split_in_batches appends batch rows to it without a header.
# NOTE(review): a reviewer asked to check for records with NaN in
# "corrected_spelling" before writing.
if not os.path.exists(f"{output_folder}/{file_prefix}{file_suffix}.csv"):
    all_columns = [
        "documents",
        "label",
        "corrected_spelling",
    ]
    pd.DataFrame(columns=all_columns).to_csv(
        f"{output_folder}/{file_prefix}{file_suffix}.csv", index=False
    )

# split_in_batches writes into the frame it receives, so hand it an
# independent copy rather than a slice of not_in_list (avoids
# SettingWithCopyWarning and accidental writes through a view).
s = not_in_list.head(1000).copy()
asyncio.run(split_in_batches(s))

# asyncio.run(split_in_batches(not_in_list))
# asyncio.run(split_in_batches(in_list))
Uh oh!
There was an error while loading. Please reload this page.