Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .ipynb_checkpoints/program-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
84 changes: 84 additions & 0 deletions code/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"Identification": {
"Product Name": "Product Name[:]*\\s*(.*)",
"Cat No.": "Cat No.[:]*\\s*(.*)",
"Synonyms": "Synonyms[:]*\\s*(.*)",
"Recommended Use": "Recommended Use[:]*\\s*(.*)",
"Uses advised against": "Uses advised against[:]*\\s*(.*)",
"Company": {
"Name": "Company[:]*\\s*(.*)",
"Address": "Company[:]*\\s*.*\\s*(.*\\s.*\\s.*\\d{5})",
"Telephone": "Tel[:]*\\s*(\\(\\d{3}\\) \\d{3}-\\d{4})"
}

},
"Emergency Contact": {
"Emergency Telephone Number": "Emergency Telephone Number[:]*\\s*(.*)"
},
"Hazard Identification": {
"Classification": "Classification[:]*\\s*(.*)",
"Signal Word": "Signal Word[:]*\\s*(.*)",
"Hazard Statements": "Hazard Statements[:]*\\s*(.*)",
"Precautionary Statements": {
"Prevention": "Prevention[:]*\\s*((?:.*\\n)+?)(?=(Eyes|Disposal|Hazards not otherwise classified|\\Z))",
"Response": "Response[:]*\\s*(.*)(?=Eyes|\\nEyes)",
"Eyes": "Eyes[:]*\\s*(.*)",
"Disposal": "Disposal[:]*\\s*(.*)",
"Hazards not otherwise classified (HNOC)": "Hazards not otherwise classified[:]*\\s*(.*)"
}
},
"Composition/Information on Ingredients": {
"Component": "Component[:]*\\s*(.*)",
"CAS No": "CAS No[:]*\\s*(.*)",
"Weight %": "Weight %[:]*\\s*(.*)"
},
"First Aid Measures": {
"General Advice": "General Advice[:]*\\s*(.*)",
"Eye Contact": "Eye Contact[:]*\\s*(.*)",
"Skin Contact": "Skin Contact[:]*\\s*(.*)",
"Inhalation": "Inhalation[:]*\\s*(.*)",
"Ingestion": "Ingestion[:]*\\s*(.*)"
},
"Fire Fighting Measures": {
"Flash Point": "Flash Point[:]*\\s*(.*)",
"Autoignition Temperature": "Autoignition Temperature[:]*\\s*(.*)",
"Explosion Limits": "Explosion Limits[:]*\\s*(.*)"
},
"Handling and Storage": {
"Handling": "Handling[:]*\\s*(.*)",
"Storage": "Storage[:]*\\s*(.*)"
},
"Exposure Controls": {
"Engineering Measures": "Engineering Measures[:]*\\s*(.*)",
"Eye/face Protection": "Eye/face Protection[:]*\\s*(.*)",
"Skin Protection": "Skin Protection[:]*\\s*(.*)",
"Respiratory Protection": "Respiratory Protection[:]*\\s*(.*)"
},
"Physical and Chemical Properties": {
"Physical State": "Physical State[:]*\\s*(.*)",
"Appearance": "Appearance[:]*\\s*(.*)",
"Odor": "Odor[:]*\\s*(.*)",
"pH": "pH[:]*\\s*(.*)",
"Melting Point/Range": "Melting Point/Range[:]*\\s*(.*)",
"Boiling Point/Range": "Boiling Point/Range[:]*\\s*(.*)",
"Vapor Pressure": "Vapor Pressure[:]*\\s*(.*)",
"Solubility": "Solubility[:]*\\s*(.*)"
},
"Stability and Reactivity": {
"Stability": "Stability[:]*\\s*(.*)",
"Incompatible Materials": "Incompatible Materials[:]*\\s*(.*)"
},
"Toxicological Information": {
"Acute Toxicity": "Acute Toxicity[:]*\\s*(.*)",
"Chronic Effects": "Chronic Effects[:]*\\s*(.*)"
},
"Ecological Information": {
"Ecotoxicity": "Ecotoxicity[:]*\\s*(.*)"
},
"Disposal Considerations": {
"Waste Disposal Methods": "Waste Disposal Methods[:]*\\s*(.*)"
},
"Transport Information": {
"DOT": "DOT[:]*\\s*(.*)"
}
}
114 changes: 114 additions & 0 deletions code/program.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"id": "106932cf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed edta--ph--lt.pdf and saved JSON data to edta--ph--lt.json\n"
]
}
],
"source": [
"import pdfplumber\n",
"import json\n",
"import re\n",
"import os\n",
"\n",
"def extract_text_from_pdf_plumber(pdf_path):\n",
" with pdfplumber.open(pdf_path) as pdf:\n",
" text = \"\"\n",
" for page in pdf.pages:\n",
" page_text = page.extract_text()\n",
" text += page_text + \"\\n\"\n",
" return text\n",
"\n",
"def load_patterns(config_path):\n",
" with open(config_path, 'r') as config_file:\n",
" return json.load(config_file)\n",
"\n",
"def parse_sds_text(text, patterns):\n",
" sds_dict = {}\n",
"\n",
" for section, field_patterns in patterns.items():\n",
" sds_dict[section] = {}\n",
"\n",
" for field, pattern in field_patterns.items():\n",
" if isinstance(pattern, dict):\n",
" # Handle nested dictionary (e.g., Company information)\n",
" sds_dict[section][field] = {}\n",
" for sub_field, sub_pattern in pattern.items():\n",
" match = re.search(sub_pattern, text)\n",
" if match:\n",
" sds_dict[section][field][sub_field] = match.group(1).strip()\n",
" else:\n",
" # Handle regular pattern matching\n",
" match = re.search(pattern, text)\n",
" if match:\n",
" sds_dict[section][field] = match.group(1).strip()\n",
"\n",
" return sds_dict\n",
"\n",
"\n",
"def process_multiple_pdfs(pdf_folder, config_path, output_folder):\n",
" # Load regex patterns from config\n",
" section_patterns = load_patterns(config_path)\n",
" \n",
" # Ensure output folder exists\n",
" if not os.path.exists(output_folder):\n",
" os.makedirs(output_folder)\n",
" \n",
" # Loop through each PDF file in the folder\n",
" for pdf_filename in os.listdir(pdf_folder):\n",
" if pdf_filename.endswith(\".pdf\"):\n",
" pdf_path = os.path.join(pdf_folder, pdf_filename)\n",
" \n",
" # Extract text from the PDF\n",
" extracted_text = extract_text_from_pdf_plumber(pdf_path)\n",
" \n",
" # Parse the extracted text using the patterns\n",
" sds_data = parse_sds_text(extracted_text, section_patterns)\n",
" \n",
" # Save the parsed data as JSON\n",
" json_filename = os.path.splitext(pdf_filename)[0] + \".json\"\n",
" json_path = os.path.join(output_folder, json_filename)\n",
" with open(json_path, 'w') as json_file:\n",
" json.dump(sds_data, json_file, indent=4)\n",
" \n",
" print(f\"Processed {pdf_filename} and saved JSON data to {json_filename}\")\n",
"\n",
"pdf_folder = \"/home/sabrina/Documents/Project/data/\"\n",
"config_path = \"/home/sabrina/Documents/Project/config.json\"\n",
"output_folder = \"/home/sabrina/Documents/Project/json_output/\"\n",
"\n",
"process_multiple_pdfs(pdf_folder, config_path, output_folder)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
80 changes: 80 additions & 0 deletions output/acetone-acs-l (1).json
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"Identification": {
"Product Name": "Acetone",
"Cat No.": ": A9-4; A9-20; A9-200; A11-1; A11-4; A11-20; A11-200; A11S-4; A13-20;",
"Synonyms": "2-Propanone; Dimethyl ketone; (Certified ACS, HPLC, OPTIMA, Histological,",
"Recommended Use": "Laboratory chemicals.",
"Uses advised against": "Food, drug, pesticide or biocidal product use.",
"Company": {
"Name": "Fisher Scientific Company",
"Address": "One Reagent Lane\nFair Lawn, NJ 07410",
"Telephone": "(201) 796-7100"
}
},
"Emergency Contact": {
"Emergency Telephone Number": "CHEMTREC\u00d2, Inside the USA: 800-424-9300"
},
"Hazard Identification": {
"Classification": "______________________________________________________________________________________________",
"Signal Word": "Danger",
"Hazard Statements": "Highly flammable liquid and vapor",
"Precautionary Statements": {
"Prevention": "Wash face, hands and any exposed skin thoroughly after handling\nDo not breathe dust/fume/gas/mist/vapors/spray\nUse only outdoors or in a well-ventilated area\nKeep away from heat/sparks/open flames/hot surfaces. - No smoking\nKeep container tightly closed\nGround/bond container and receiving equipment\nUse explosion-proof electrical/ventilating/lighting equipment\nUse only non-sparking tools\nTake precautionary measures against static discharge\nWear protective gloves/protective clothing/eye protection/face protection\nKeep cool\nResponse\nGet medical attention/advice if you feel unwell\nInhalation\nIF INHALED: Remove victim to fresh air and keep at rest in a position comfortable for breathing\nCall a POISON CENTER or doctor/physician if you feel unwell\nSkin\nIF ON SKIN (or hair): Take off immediately all contaminated clothing. Rinse skin with water/shower",
"Eyes": "IF IN EYES: Rinse cautiously with water for several minutes. Remove contact lenses, if present and easy to do. Continue rinsing",
"Disposal": "______________________________________________________________________________________________",
"Hazards not otherwise classified (HNOC)": "(HNOC)"
}
},
"Composition/Information on Ingredients": {
"Component": "CAS No Weight %",
"CAS No": "67-64-1",
"Weight %": "Acetone 67-64-1 >95"
},
"First Aid Measures": {
"General Advice": "If symptoms persist, call a physician.",
"Eye Contact": "Rinse immediately with plenty of water, also under the eyelids, for at least 15 minutes. Get",
"Skin Contact": "Wash off immediately with plenty of water for at least 15 minutes. If skin irritation persists,",
"Inhalation": "IF INHALED: Remove victim to fresh air and keep at rest in a position comfortable for breathing",
"Ingestion": "Clean mouth with water and drink afterwards plenty of water."
},
"Fire Fighting Measures": {
"Flash Point": "-20 \u00b0C / -4 \u00b0F",
"Autoignition Temperature": "465 \u00b0C / 869 \u00b0F",
"Explosion Limits": "Upper 12.8 vol %"
},
"Handling and Storage": {
"Handling": "and storage",
"Storage": "Store in a well-ventilated place. Keep container tightly closed"
},
"Exposure Controls": {
"Engineering Measures": "Ensure adequate ventilation, especially in confined areas. Ensure that eyewash stations",
"Eye/face Protection": "Wear appropriate protective eyeglasses or chemical safety goggles as described by",
"Respiratory Protection": "Follow the OSHA respirator regulations found in 29 CFR 1910.134 or European Standard"
},
"Physical and Chemical Properties": {
"Physical State": "Liquid",
"Appearance": "Colorless",
"Odor": "sweet",
"pH": "7",
"Melting Point/Range": "-95 \u00b0C / -139 \u00b0F",
"Boiling Point/Range": "56 \u00b0C / 132.8 \u00b0F",
"Vapor Pressure": "247 mbar @ 20 \u00b0C",
"Solubility": "Soluble in water"
},
"Stability and Reactivity": {
"Stability": "and reactivity",
"Incompatible Materials": ". Strong oxidizing agents."
},
"Toxicological Information": {
"Acute Toxicity": "Product Information"
},
"Ecological Information": {
"Ecotoxicity": "."
},
"Disposal Considerations": {
"Waste Disposal Methods": "Chemical waste generators must determine whether a discarded chemical is classified as a"
},
"Transport Information": {
"DOT": "UN-No UN1090"
}
}
80 changes: 80 additions & 0 deletions output/acetone-acs-l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"Identification": {
"Product Name": "Acetone",
"Cat No.": ": A9-4; A9-20; A9-200; A11-1; A11-4; A11-20; A11-200; A11S-4; A13-20;",
"Synonyms": "2-Propanone; Dimethyl ketone; (Certified ACS, HPLC, OPTIMA, Histological,",
"Recommended Use": "Laboratory chemicals.",
"Uses advised against": "Food, drug, pesticide or biocidal product use.",
"Company": {
"Name": "Fisher Scientific Company",
"Address": "One Reagent Lane\nFair Lawn, NJ 07410",
"Telephone": "(201) 796-7100"
}
},
"Emergency Contact": {
"Emergency Telephone Number": "CHEMTREC\u00d2, Inside the USA: 800-424-9300"
},
"Hazard Identification": {
"Classification": "______________________________________________________________________________________________",
"Signal Word": "Danger",
"Hazard Statements": "Highly flammable liquid and vapor",
"Precautionary Statements": {
"Prevention": "Wash face, hands and any exposed skin thoroughly after handling\nDo not breathe dust/fume/gas/mist/vapors/spray\nUse only outdoors or in a well-ventilated area\nKeep away from heat/sparks/open flames/hot surfaces. - No smoking\nKeep container tightly closed\nGround/bond container and receiving equipment\nUse explosion-proof electrical/ventilating/lighting equipment\nUse only non-sparking tools\nTake precautionary measures against static discharge\nWear protective gloves/protective clothing/eye protection/face protection\nKeep cool\nResponse\nGet medical attention/advice if you feel unwell\nInhalation\nIF INHALED: Remove victim to fresh air and keep at rest in a position comfortable for breathing\nCall a POISON CENTER or doctor/physician if you feel unwell\nSkin\nIF ON SKIN (or hair): Take off immediately all contaminated clothing. Rinse skin with water/shower",
"Eyes": "IF IN EYES: Rinse cautiously with water for several minutes. Remove contact lenses, if present and easy to do. Continue rinsing",
"Disposal": "______________________________________________________________________________________________",
"Hazards not otherwise classified (HNOC)": "(HNOC)"
}
},
"Composition/Information on Ingredients": {
"Component": "CAS No Weight %",
"CAS No": "67-64-1",
"Weight %": "Acetone 67-64-1 >95"
},
"First Aid Measures": {
"General Advice": "If symptoms persist, call a physician.",
"Eye Contact": "Rinse immediately with plenty of water, also under the eyelids, for at least 15 minutes. Get",
"Skin Contact": "Wash off immediately with plenty of water for at least 15 minutes. If skin irritation persists,",
"Inhalation": "IF INHALED: Remove victim to fresh air and keep at rest in a position comfortable for breathing",
"Ingestion": "Clean mouth with water and drink afterwards plenty of water."
},
"Fire Fighting Measures": {
"Flash Point": "-20 \u00b0C / -4 \u00b0F",
"Autoignition Temperature": "465 \u00b0C / 869 \u00b0F",
"Explosion Limits": "Upper 12.8 vol %"
},
"Handling and Storage": {
"Handling": "and storage",
"Storage": "Store in a well-ventilated place. Keep container tightly closed"
},
"Exposure Controls": {
"Engineering Measures": "Ensure adequate ventilation, especially in confined areas. Ensure that eyewash stations",
"Eye/face Protection": "Wear appropriate protective eyeglasses or chemical safety goggles as described by",
"Respiratory Protection": "Follow the OSHA respirator regulations found in 29 CFR 1910.134 or European Standard"
},
"Physical and Chemical Properties": {
"Physical State": "Liquid",
"Appearance": "Colorless",
"Odor": "sweet",
"pH": "7",
"Melting Point/Range": "-95 \u00b0C / -139 \u00b0F",
"Boiling Point/Range": "56 \u00b0C / 132.8 \u00b0F",
"Vapor Pressure": "247 mbar @ 20 \u00b0C",
"Solubility": "Soluble in water"
},
"Stability and Reactivity": {
"Stability": "and reactivity",
"Incompatible Materials": ". Strong oxidizing agents."
},
"Toxicological Information": {
"Acute Toxicity": "Product Information"
},
"Ecological Information": {
"Ecotoxicity": "."
},
"Disposal Considerations": {
"Waste Disposal Methods": "Chemical waste generators must determine whether a discarded chemical is classified as a"
},
"Transport Information": {
"DOT": "UN-No UN1090"
}
}
Loading