Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ ncm_text.json
#chromadb
#csv tipi
ncm_chapter_85.csv
app/libs/extract_pdf/extract_new/vectorstore2_invoice
vectorstore2_invoice
/vectorstore2_invoice


#collection chromadb
Expand Down Expand Up @@ -200,6 +203,7 @@ env-embedding
INVOICE TECSYS.pdf
exemplo_pdf_entrada.pdf
Pedido compras_Mouser.pdf
Invoice Mouser_.pdf
# mkdocs documentation
/site

Expand Down
121 changes: 121 additions & 0 deletions app/libs/extract_pdf/extract_new/embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import uuid
import requests
from typing import List
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import Chroma
import streamlit as st
import pandas as pd
from app.log.logger import logger

# Filesystem location where the Chroma collection is persisted between runs.
VECTORSTORE_PATH = "app/libs/extract_pdf/extract_new/vectorstore2_invoice"
# Ollama embedding model used for both documents and queries.
EMBEDDING_MODEL = "nomic-embed-text"
# Base URL of the local Ollama HTTP server.
OLLAMA_URL = "http://localhost:11434"
# Per-request timeout (seconds) for embedding HTTP calls.
TIMEOUT = 120
# Default invoice PDF used to seed the vectorstore when none exists on disk.
PDF_PATH = "./app/libs/extract_pdf/extract_new/INVOICE TECSYS.pdf"


class CustomOllamaEmbeddingFunction(Embeddings):
    """Embeddings implementation that delegates to a local Ollama server.

    Each text becomes one POST request to Ollama's ``/api/embeddings``
    endpoint. Failures are logged and yield an empty vector instead of
    raising, so callers see best-effort results.
    """

    def __init__(self, model_name: str, url: str = OLLAMA_URL, timeout: int = TIMEOUT):
        # Name of the model served by Ollama (e.g. "nomic-embed-text").
        self.model_name = model_name
        # Base URL of the Ollama HTTP API.
        self.url = url
        # Per-request timeout in seconds.
        self.timeout = timeout

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each document independently (one HTTP round-trip per text)."""
        vectors: List[List[float]] = []
        for item in texts:
            vectors.append(self._request_embedding(item))
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self._request_embedding(text)

    def _request_embedding(self, text: str) -> List[float]:
        """Request one embedding vector from Ollama; return [] on any failure."""
        endpoint = f"{self.url}/api/embeddings"
        payload = {"model": self.model_name, "prompt": text}
        try:
            response = requests.post(endpoint, json=payload, timeout=self.timeout)
            response.raise_for_status()
            # A missing "embedding" key also degrades to an empty vector.
            return response.json().get("embedding", [])
        except Exception as e:
            logger.error(f"[EMBEDDING] Error: {e}")
            return []


class InvoiceChromaDBManager:
    """Manage the invoice Chroma vectorstore and similarity search over it.

    Construction either reuses the persisted collection on disk (when it
    already contains documents) or rebuilds it from the bundled invoice PDF,
    then wires up a top-20 similarity retriever.
    """

    def __init__(self):
        logger.info("[CHROMADB-PDF] Initializing")
        self.embedding_function = CustomOllamaEmbeddingFunction(EMBEDDING_MODEL)
        self.vectorstore = self._load_or_create_vectorstore()
        # Plain similarity search, returning up to 20 chunks per query.
        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 20},
        )
        logger.info("[CHROMADB-PDF] Ready")

    def _load_or_create_vectorstore(self):
        """Return the persisted store if it has content; otherwise rebuild it."""
        try:
            store = Chroma(
                persist_directory=VECTORSTORE_PATH,
                embedding_function=self.embedding_function,
            )
            # NOTE(review): _collection is a private Chroma attribute; a
            # non-zero count means a usable persisted collection exists.
            if store._collection.count() > 0:
                logger.info(f"[CHROMADB-PDF] Loaded with {store._collection.count()} documents")
                return store
        except Exception as e:
            logger.warning(f"[CHROMADB-PDF] Failed to load: {e}")
        # Empty or unreadable store: build a fresh one from the default PDF.
        return self._create_vectorstore_from_pdf(PDF_PATH)

    def _create_vectorstore_from_pdf(self, path: str):
        """Build, persist and return a Chroma store from the PDF at *path*."""
        logger.info("[CHROMADB-PDF] Creating new store")

        pages = PyPDFLoader(path).load()
        logger.info(f"[CHROMADB-PDF] Loaded {len(pages)} pages")

        # Small chunks with heavy overlap; separators are chosen so that
        # "PN:"-labeled part numbers tend to stay inside a single chunk.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=150,
            chunk_overlap=50,
            length_function=len,
            separators=["\n", "PN:", " - ", ":", " ", " ", ""],
        )
        chunks = splitter.split_documents(pages)

        logger.info(f"[CHROMADB-PDF] Produced {len(chunks)} chunks")

        # Deduplicate by content: uuid5 of the text is deterministic, so
        # identical chunks share an id and only the first occurrence is kept.
        unique = {}
        for piece in chunks:
            unique.setdefault(
                str(uuid.uuid5(uuid.NAMESPACE_DNS, piece.page_content)), piece
            )

        logger.info(f"[CHROMADB-PDF] Deduplicated to {len(unique)} chunks")

        store = Chroma.from_documents(
            documents=list(unique.values()),
            ids=list(unique.keys()),
            embedding=self.embedding_function,
            persist_directory=VECTORSTORE_PATH,
        )
        store.persist()
        logger.info("[CHROMADB-PDF] Store created")

        return store

    def search_parts(self, query: str = "PN:") -> List[str]:
        """Return the page contents of chunks similar to *query* ([] on error)."""
        try:
            logger.info(f"[CHROMADB-PDF] Searching: {query}")
            return [doc.page_content for doc in self.retriever.invoke(query)]
        except Exception as e:
            logger.error(f"[CHROMADB-PDF] Search error: {e}")
            return []


# Module-level singleton. NOTE: building it at import time triggers the
# vectorstore load/creation (and possibly PDF parsing plus embedding HTTP
# calls to Ollama) as a side effect of importing this module.
invoice_chroma_manager = InvoiceChromaDBManager()

if __name__ == "__main__":
    # Smoke test: list the chunks retrieved for the "PN:" part-number marker.
    results = invoice_chroma_manager.search_parts("PN:")
    print("Relevant Parts Found:")
    for i, part in enumerate(results, 1):
        print(f"{i}. {part}")
200 changes: 200 additions & 0 deletions app/libs/extract_pdf/extract_new/extract_pn_from_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import re
from typing import List


def extract_part_numbers(text: str) -> List[str]:
    """Extract candidate Part Numbers (PNs) from raw invoice text.

    Strategy (v3.0):
      - A deliberately generic regex gathers candidate tokens
        (letters/digits plus ``-``, ``_``, ``.``).
      - A battery of EXCLUSION filters removes phone numbers, postal codes,
        dates, HS codes, measurements, SMD package names, etc.
      - Supplier-specific rules (Mouser, Tecsys) drop internal order codes
        unless they are explicitly labeled with "PN:".
      - Remaining tokens then go through ACCEPTANCE rules to qualify as PNs.
      - Duplicates are removed while preserving first-seen order.

    Args:
        text: Raw text extracted from an invoice document.

    Returns:
        Accepted part-number tokens, deduplicated, in original order.
    """

    # Upper-cased copy used only for supplier detection.
    upper_text = text.upper()

    is_mouser = (
        "MOUSER ELECTRONICS" in upper_text
        or "WWW.MOUSER.COM" in upper_text
        or "MOUSER BRASIL" in upper_text
    )
    is_tecsys = (
        "TECSYS DO BRASIL" in upper_text
        or "XWORKSOLUTIONS.COM" in upper_text
    )

    # Generic PN candidate regex:
    # - starts with a letter/digit
    # - followed by 3+ characters from [letter, digit, -, _, .]
    pn_regex = r"\b([A-Za-z0-9][A-Za-z0-9\-_.]{3,})\b"
    candidates = re.findall(pn_regex, text)

    def _labeled_as_pn(token: str) -> bool:
        """True when *token* appears explicitly after a "PN:" marker."""
        return bool(
            re.search(rf"PN[:\s]+{re.escape(token)}\b", text, flags=re.IGNORECASE)
        )

    # Supporting sets
    generic_blacklist = {
        # invoice / administrative terms
        "EX-WORKS", "WORKS", "INVOICE", "TOTAL", "SUBTOTAL",
        "INFORMATION", "DECLARE", "CONTAINED", "CORRECT",
        "ITEM", "DESC", "MFR", "SCHEDULE", "LOT", "COO", "PAGE",
        "CUSTOMER", "NUMBER", "PAYMENT", "TERMS", "SHIP", "INCOTERM",
        "INVOICE#", "INVOICENO",

        # general weights / measures
        "N.WT", "G.WT", "NWT", "GWT",

        # more troublesome administrative terms
        "ANTI-CORRUPTION", "END-USER", "ENDUSER",
        "ENVIE-NOS", "ENVIENOS", "END-USE",

        # generic descriptions
        "RETURN", "LOSS", "GAIN", "MOUNT", "SOURCE",
        "HARDWARE", "EMBEDDED", "COMPUTER", "LINUX",
        "SINGLE", "BOARD",

        # CPU / SoC related
        "ARM926J",
    }

    # common packages/footprints (never wanted as PNs)
    package_tokens = {
        "DFN-12", "DFN12",
        "SMA", "DO-214AC", "SOT-23", "SOT-223", "SOT-523", "SOT-723",
        "QFN", "BGA", "SOIC",
        "TO-220", "TO-252-3", "TO-252", "TO252", "TO-92", "LGA", "LCC",
    }

    # Final result, plus an O(1)-membership mirror for deduplication.
    final: List[str] = []
    seen: set = set()

    def _accept(token: str) -> None:
        """Append *token* to the result if it has not been seen yet."""
        if token not in seen:
            seen.add(token)
            final.append(token)

    for tok in candidates:
        t = tok.strip()
        if not t:
            continue

        upper = t.upper()

        # ---------------------------
        # 1) UNIVERSAL EXCLUSIONS
        # ---------------------------

        # a) phone numbers: 0000-0000, 12-3456-7890, etc.
        if re.match(r"^\d{4}-\d{4}$", t):
            continue
        if re.match(r"^\d{2,3}-\d{3,4}-\d{3,4}$", t):
            continue

        # b) Brazilian postal codes (CEP): 12345-678 or 12.345-678
        if re.match(r"^\d{5}-\d{3}$", t):
            continue
        if re.match(r"^\d{2}\.\d{3}-\d{3}$", t):
            continue

        # c) dates / periods: 3-2025, 03-2025, 3/5/2025, etc.
        if re.match(r"^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$", t):
            continue
        if re.match(r"^\d{1,2}[-/]\d{1,4}$", t):
            continue

        # d) letters only -> not a PN
        if t.isalpha():
            continue

        # e) numeric ranges like 1700-2700 (frequency bands)
        if re.match(r"^\d{3,4}-\d{3,4}$", t):
            continue

        # f) domain/host without digits (e.g. "xworksolutions.com")
        if "." in t and not any(c.isdigit() for c in t):
            continue

        # g) hyphenated words without digits (UP-RIGHT, ITEM-CLIENTE)
        if re.match(r"^[A-Za-z\-]+$", t) and not any(c.isdigit() for c in t):
            continue

        # h) hard blacklist (words we NEVER want as PNs)
        if upper in generic_blacklist:
            continue

        # i) package/footprint tokens (SOT-23, DFN-12, etc.)
        if upper in package_tokens:
            continue

        # j) technical unit suffixes (dimensions, capacitance, etc.)
        if upper.endswith(("MM", "CM", "IN")):
            continue
        if upper.endswith(("UF", "NF", "PF", "MF")):
            continue
        if upper.endswith(("VAC", "VDC", "AC", "X1", "X2")):
            continue
        if upper.endswith(("HZ", "KHZ", "MHZ", "GHZ")):
            continue

        # k) sizes like 12.5X25 or 6X12
        if re.match(r"^\d+(\.\d+)?x\d+(\.\d+)?$", t, re.IGNORECASE):
            continue

        # ---------------------------
        # 2) SUPPLIER-SPECIFIC RULES
        # ---------------------------

        if is_mouser:
            # Mouser internal order codes like 61-1520598 (numeric prefix +
            # hyphen + digits); keep only when explicitly labeled "PN:".
            if re.match(r"^\d{2,4}-\d{5,}$", t) and not _labeled_as_pn(t):
                continue

        if is_tecsys:
            # Tecsys internal codes (e.g. 00020020069): pure-numeric tokens of
            # 9-12 digits; keep only when explicitly labeled "PN:".
            if t.isdigit() and 9 <= len(t) <= 12 and not _labeled_as_pn(t):
                continue

        # ---------------------------
        # 3) SPECIAL RULES FOR PURE NUMBERS
        # ---------------------------

        if t.isdigit():
            # HS CODE / SCHEDULE B: 10 digits (3926909989, 8536694030, ...)
            if len(t) == 10:
                continue
            # Accept long numbers only when explicitly labeled "PN:";
            # otherwise discard them.
            if _labeled_as_pn(t):
                _accept(t)
            continue

        # ---------------------------
        # 4) GENERAL ACCEPTANCE (ALPHA + DIGIT)
        # ---------------------------

        has_alpha = any(c.isalpha() for c in t)
        has_digit = any(c.isdigit() for c in t)

        # a) "classic" alphanumeric PNs (no hyphen required)
        #    e.g. LTC3625EDE, AHEF1000, CL10C330JB8NNNC, GRM1885C1H180JA01D
        if len(t) >= 6 and has_alpha and has_digit:
            _accept(t)
            continue

        # b) hyphenated hybrids of reasonable length
        #    e.g. STPS5H100B-TR, ESD7C3.3DT5G, ECS-3225Q-33-260-BS-TR
        if "-" in t and has_digit and len(t) >= 8:
            _accept(t)
            continue

        # c) fallback: long tokens containing letters (exotic cases)
        if len(t) >= 12 and has_alpha:
            _accept(t)
            continue

    return final
Loading