HardpointConsulting · Ignatius-Nobel · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/Ignatius Nobel Chittilappilly/Internship/Data Extracted.xlsx b/Ignatius Nobel Chittilappilly/Internship/Data Extracted.xlsx
diff --git a/Ignatius Nobel Chittilappilly/Internship/Internship at Hardpoint Consulting.docx b/Ignatius Nobel Chittilappilly/Internship/Internship at Hardpoint Consulting.docx
diff --git a/Ignatius Nobel Chittilappilly/Internship/__pycache__/pypdf.cpython-310.pyc b/Ignatius Nobel Chittilappilly/Internship/__pycache__/pypdf.cpython-310.pyc
diff --git a/Ignatius Nobel Chittilappilly/Internship/assitant for visually impaired.pdf b/Ignatius Nobel Chittilappilly/Internship/assitant for visually impaired.pdf
diff --git a/Ignatius Nobel Chittilappilly/Internship/extractor.py b/Ignatius Nobel Chittilappilly/Internship/extractor.py
@@ -0,0 +1,51 @@
+from PyPDF2 import PdfReader
+from transformers import pipeline
+
+# Finding Author
+def pdf_author(path):
+    """Extract likely author names from the first page of a PDF.
+
+    The first page's text is also written to a sidecar text file whose name
+    is ``path`` with every ".pdf" occurrence removed. ``pdf_summary`` reads
+    that file back using the same rule, so the naming must not change here
+    alone.
+
+    Args:
+        path: Filesystem path (str) of the PDF to inspect.
+
+    Returns:
+        The entities the NER pipeline tagged as persons, joined by spaces.
+        Single-character entities are rendered as initials ("J.") and longer
+        ones get a trailing comma ("Smith,"). Returns "" when the PDF has no
+        pages or its first page yields no text.
+    """
+    reader = PdfReader(path)
+    # Guard against page-less documents and extractors that return None.
+    text = (reader.pages[0].extract_text() or "") if len(reader.pages) else ""
+
+    text_path = path.replace(".pdf", "")
+    try:
+        with open(text_path, "w", encoding='utf-8') as file:
+            file.write(text)
+        print("File created successfully.")
+    except (OSError, UnicodeError) as e:
+        # Best-effort: a failed write is reported but does not stop author
+        # extraction (pdf_summary will fail later if the file is missing).
+        print("An error occurred:", str(e))
+
+    if not text.strip():
+        # Nothing to tag; skip loading the model entirely.
+        return ""
+
+    # aggregation_strategy="simple" replaces the deprecated
+    # grouped_entities=True and yields the same grouped 'entity_group' items.
+    ner = pipeline("ner", aggregation_strategy="simple")
+    auth = []
+    for entity in ner(text):
+        if entity['entity_group'] != 'PER':
+            continue
+        word = entity['word']
+        if len(word) <= 1:
+            # A lone character is treated as an initial: strip stray
+            # punctuation/whitespace and terminate it with a period.
+            cleaned = word.replace(",", "").replace("'", "").replace(" ", "")
+            auth.append(cleaned + ".")
+        else:
+            auth.append(word + ",")
+    return " ".join(auth)
+# Summarization 
+def pdf_summary(path, *, max_chunk_length=512, min_length=10, max_length=30):
+    """Summarize the text previously extracted from a PDF.
+
+    Reads the sidecar text file named ``path`` with every ".pdf" occurrence
+    removed (the file ``pdf_author`` writes), splits it into fixed-size
+    character chunks, summarizes each chunk with facebook/bart-large-cnn and
+    joins the partial summaries with single spaces.
+
+    Args:
+        path: Path (str) of the original PDF; only used to derive the name of
+            the text file to read.
+        max_chunk_length: Chunk size in characters (not model tokens).
+        min_length: ``min_length`` passed to the summarizer for each chunk.
+        max_length: ``max_length`` passed to the summarizer for each chunk.
+
+    Returns:
+        The chunk summaries joined by spaces, or "" when the text file
+        contains nothing but whitespace.
+
+    Raises:
+        ValueError: If ``max_chunk_length`` is not positive.
+        FileNotFoundError: If the sidecar text file does not exist.
+    """
+    if max_chunk_length <= 0:
+        raise ValueError("max_chunk_length must be a positive integer")
+
+    # Read first so a missing file fails before the (large) model is loaded.
+    file_path = path.replace(".pdf", "")
+    with open(file_path, "r", encoding="utf-8") as file:
+        text = file.read()
+
+    # Fixed-width character slices; whitespace-only slices carry nothing to
+    # summarize and are dropped.
+    chunks = [
+        text[start:start + max_chunk_length]
+        for start in range(0, len(text), max_chunk_length)
+    ]
+    chunks = [chunk for chunk in chunks if chunk.strip()]
+    if not chunks:
+        return ""
+
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+    # The chunk size is in characters, so a chunk may still tokenize to more
+    # tokens than the model accepts; truncation=True clips such input
+    # instead of letting the call fail.
+    summaries = [
+        summarizer(
+            chunk,
+            min_length=min_length,
+            max_length=max_length,
+            truncation=True,
+        )[0]['summary_text']
+        for chunk in chunks
+    ]
+
+    return ' '.join(summaries)
+
diff --git a/Ignatius Nobel Chittilappilly/Internship/main.py b/Ignatius Nobel Chittilappilly/Internship/main.py
@@ -0,0 +1,63 @@
+import streamlit as st
+from extractor import *
+import re
+
+# Browser-tab title/icon first, then the visible heading and prompt.
+st.set_page_config(page_title="PDF Extractor", page_icon="🗃")
+st.title("PDF Extractor 📰")
+st.subheader("Upload your files to extract data!!!")
+
+def entry_db(title,year, journal, author,types, summary):
+    """Append one record to the active sheet of 'Data Extracted.xlsx'.
+
+    The record goes into the row after the sheet's current ``max_row``.
+    Columns 1-7 receive, in order: a serial number, title, year, journal,
+    author, summary and types (summary is written before types, unlike the
+    parameter order). The workbook is saved back to the same file.
+
+    The serial number is the target row minus 2 -- presumably the sheet
+    starts with two non-data rows; confirm against the workbook.
+    """
+    import openpyxl
+
+    workbook_file = 'Data Extracted.xlsx'
+    book = openpyxl.load_workbook(workbook_file)
+    ws = book.active
+
+    # max_row covers every column, so this is the first completely free row.
+    target_row = ws.max_row + 1
+
+    record = (target_row - 2, title, year, journal, author, summary, types)
+    for col, value in enumerate(record, start=1):
+        ws.cell(row=target_row, column=col).value = value
+
+    book.save(workbook_file)
+
+def save_uploaded_file(uploaded_file):
+    """Persist an uploaded file under the local ``docs`` directory.
+
+    Args:
+        uploaded_file: Object exposing ``name`` and ``getbuffer()`` (here,
+            the value returned by ``st.file_uploader``).
+
+    Returns:
+        The path the file was written to, as a string.
+    """
+    import os
+
+    # The client controls the file name: keep only its final component so a
+    # name containing directory parts cannot escape the docs directory.
+    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
+    os.makedirs("docs", exist_ok=True)
+    # os.path.join still yields "docs\<name>" on Windows, and a real
+    # sub-directory path on POSIX, where a backslash is not a separator.
+    saved_path = os.path.join("docs", safe_name)
+    with open(saved_path, "wb") as file:
+        file.write(uploaded_file.getbuffer())
+    return saved_path
+
+# Upload -> save -> prefill editable fields from PDF metadata and the
+# extractor models -> append the (possibly edited) values on Submit.
+uploaded_file = st.file_uploader("Upload your file as .pdf",type=["pdf"])
+if uploaded_file is not None:
+    saved_path = save_uploaded_file(uploaded_file)
+    st.success("File saved")
+
+    # Metadata is optional in a PDF: the whole record or any single field
+    # may be absent, so every value falls back to an empty default.
+    meta = PdfReader(saved_path).metadata
+    if meta is None:
+        title, subject, created = "", "", None
+    else:
+        title = meta.title or ""
+        subject = meta.subject or ""
+        try:
+            created = meta.creation_date
+        except ValueError:
+            # NOTE(review): assumed to be what the reader raises for a
+            # malformed date string -- confirm against the installed version.
+            created = None
+
+    doc_title = st.text_input("Title", title)
+    doc_year = st.text_input("Year", str(created.year) if created is not None else "")
+    # Keep only letters, digits, parentheses and spaces from the subject.
+    doc_journal = st.text_input("Journal", re.sub('[^A-Za-z0-9() ]+', '', subject))
+    doc_type = st.text_input("Type",)
+
+    # Order matters: pdf_author writes the text file that pdf_summary reads.
+    # NOTE(review): both run again on every Streamlit rerun of this script.
+    doc_author = pdf_author(saved_path)
+    doc_summary = pdf_summary(saved_path)
+    auth = st.text_input("Author",doc_author)
+    summary = st.text_area("Document Summary",doc_summary)
+    if st.button("Submit"):
+        entry_db(doc_title,doc_year, doc_journal, auth,doc_type, summary)
+
+