# LexExtract/script.py at main · VictorBaumgartner/LexExtract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import docx
import fitz  # PyMuPDF
import pytesseract
import pandas as pd
import streamlit as st
from PIL import Image

# Make sure the processed_files output directory exists before anything is saved.
SAVE_DIR = "processed_files"
os.makedirs(SAVE_DIR, exist_ok=True)

# Tesseract path configuration (Windows); adjust to match your installation.
# The override is applied only when that executable actually exists, so other
# platforms and installs keep pytesseract's own default command instead of
# failing on a path that is not there.
_WINDOWS_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(_WINDOWS_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_EXE

# CSS for the animated multi-colour gradient background.
# The rules target both `body` and `.stApp`, stretch the element to the full
# viewport, and loop the `gradientBG` keyframes every 8 seconds.
page_bg = """
<style>
    body, .stApp {
        background: linear-gradient(45deg, #32CD32, #FF4500, #FFD700, #FF4500, #32CD32);
        background-size: 400% 400%;
        animation: gradientBG 8s ease infinite;
        height: 100vh;
        width: 100vw;
        position: fixed;
        overflow: hidden;
    }
    @keyframes gradientBG {
        0% { background-position: 0% 50%; }
        50% { background-position: 100% 50%; }
        100% { background-position: 0% 50%; }
    }
</style>
"""
# unsafe_allow_html=True is needed so the <style> tag is injected as HTML
# rather than rendered as literal text.
st.markdown(page_bg, unsafe_allow_html=True)

# Page title shown at the top of the app.
st.title("📜 LexExtract - Extraction de données juridiques")

def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the PDF
            bytes (for example a Streamlit uploaded file).

    Returns:
        The text of all pages, joined with newlines.
    """
    # Open the document as a context manager so it is closed (and its
    # buffers released) even if text extraction raises part-way through.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)

def extract_text_from_image(image_file):
    """Run Tesseract OCR on an image (JPG, PNG) and return the recognised text.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts for the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # "eng+fra" asks Tesseract to recognise both English and French.
    return pytesseract.image_to_string(Image.open(image_file), lang="eng+fra")

def save_to_docx(text, filename):
    """Save the extracted text as a single-paragraph DOCX file in ``SAVE_DIR``.

    Args:
        text: The text to write.
        filename: Base name of the output file, without the ``.docx`` suffix.

    Returns:
        The path of the written file.
    """
    # DOCX content is XML, which forbids control characters below 0x20 other
    # than tab, line feed and carriage return. OCR and PDF output commonly
    # contains them (e.g. a trailing form feed), and writing them would raise
    # ValueError, so they are dropped before the paragraph is created.
    forbidden = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    doc = docx.Document()
    doc.add_paragraph(text.translate(forbidden))
    filepath = os.path.join(SAVE_DIR, f"{filename}.docx")
    doc.save(filepath)
    return filepath

def save_to_excel(data, filename):
    """Save the extracted data as an Excel workbook in ``SAVE_DIR``.

    Args:
        data: Row data for a one-column table; it is passed to
            ``pandas.DataFrame`` with the single column ``Informations``.
        filename: Base name of the output file, without the ``.xlsx`` suffix.

    Returns:
        The path of the written file.
    """
    # The Excel writer rejects cells containing control characters below 0x20
    # other than tab, line feed and carriage return. OCR and PDF output
    # commonly contains them, so they are removed from string cells; values
    # of any other type are left untouched.
    forbidden = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    df = pd.DataFrame(data, columns=['Informations'])
    df['Informations'] = df['Informations'].map(
        lambda cell: cell.translate(forbidden) if isinstance(cell, str) else cell
    )
    filepath = os.path.join(SAVE_DIR, f"{filename}.xlsx")
    df.to_excel(filepath, index=False)
    return filepath

# Streamlit interface: upload a file, extract its text, save it as DOCX and
# Excel, then offer both files for download.
uploaded_file = st.file_uploader("📂 Upload un fichier", type=["txt", "pdf", "docx", "jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Keep only the last path component of the client-supplied name so the
    # save helpers can never be steered outside SAVE_DIR.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    # Lower-case the extension so "SCAN.PDF" or "photo.JPG" take the right branch.
    file_extension = safe_name.split(".")[-1].lower()

    if file_extension == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_extension in ("jpg", "jpeg", "png"):
        extracted_text = extract_text_from_image(uploaded_file)
    elif file_extension == "docx":
        # A DOCX is a zip archive, not plain text: read its paragraphs with
        # python-docx instead of decoding the raw bytes.
        source_doc = docx.Document(uploaded_file)
        extracted_text = "\n".join(paragraph.text for paragraph in source_doc.paragraphs)
    else:
        # Read the bytes once: a second read() on the same upload returns
        # b"" because the stream is already at its end, which would make the
        # latin-1 fallback always produce an empty string.
        raw_bytes = uploaded_file.read()
        try:
            extracted_text = raw_bytes.decode("utf-8")
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this cannot fail.
            extracted_text = raw_bytes.decode("latin-1")

    st.text_area("📖 Texte extrait", extracted_text, height=300)

    # Save the extracted text in both output formats.
    base_filename = os.path.splitext(safe_name)[0]
    docx_path = save_to_docx(extracted_text, base_filename)
    excel_path = save_to_excel([[extracted_text]], base_filename)

    st.success(f"✅ Fichier DOCX sauvegardé : {docx_path}")
    st.success(f"✅ Fichier Excel sauvegardé : {excel_path}")

    # Download buttons for the two generated files.
    with open(docx_path, "rb") as f:
        st.download_button("📥 Télécharger DOCX", f, file_name=os.path.basename(docx_path))

    with open(excel_path, "rb") as f:
        st.download_button("📥 Télécharger Excel", f, file_name=os.path.basename(excel_path))