# Whatsapp_Parser/app.py (Jim0330/Whatsapp_Parser, main branch)

import os
import re
import tempfile

import docx2txt
import pdfplumber
import phonenumbers
import requests
import spacy
from dotenv import load_dotenv
from flask import Flask, request, Response
from nameparser import HumanName
from requests.auth import HTTPBasicAuth
from twilio.twiml.messaging_response import MessagingResponse

from google_drive import upload_to_drive
from google_sheets import append_row

# ---------------------- ENV SETUP ----------------------
# load_dotenv() runs first so the environment lookups below can see any values it provides.
load_dotenv()
SPREADSHEET_ID = os.environ.get("SPREADSHEET_ID")
DRIVE_FOLDER_ID = os.environ.get("DRIVE_FOLDER_ID")
TWILIO_ACCOUNT_SID = os.environ.get("TWILIO_ACCOUNT_SID")
TWILIO_AUTH_TOKEN = os.environ.get("TWILIO_AUTH_TOKEN")

# Flask application object and the spaCy pipeline used for PERSON-entity lookups.
app = Flask(__name__)
nlp = spacy.load("en_core_web_sm")

# Pattern used to find e-mail addresses in free text.
EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# Lower-case words that mark a line as a section heading rather than a person's name.
HEADING_WORDS = {
    "resume",
    "curriculum",
    "vitae",
    "professional",
    "summary",
    "objective",
    "profile",
    "experience",
    "education",
    "skills",
}


# ---------------------- HELPERS ----------------------
def extract_phone_numbers(text):
    """Return the phone numbers found in ``text``, formatted as E.164 strings.

    Matching is delegated to ``phonenumbers.PhoneNumberMatcher`` with "IN" as
    the region passed for numbers written without a country code.

    Args:
        text: Free-form text to scan. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of unique E.164 strings in order of first appearance.
    """
    if not text:
        return []
    formatted = (
        phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)
        for match in phonenumbers.PhoneNumberMatcher(text, "IN")
    )
    # dict.fromkeys drops duplicates while keeping first-seen order; list(set(...))
    # would return the numbers in an arbitrary order that can change between runs.
    return list(dict.fromkeys(formatted))

def extract_basic(text):
    """Extract a name, e-mail addresses and phone numbers from ``text``.

    The name is resolved by trying these strategies in order:
      1. an explicit "Name:" / "Full Name:" label;
      2. the first ten non-empty lines, skipping lines that contain a heading
         word, an e-mail address, a phone number or any digit;
      3. spaCy PERSON entities (only when no name has been found yet);
      4. the local part of the first e-mail address (when the name is missing
         or is a single word).

    Args:
        text: Raw resume or message text. ``None`` is treated as empty.

    Returns:
        A ``(name, emails, phones)`` tuple. ``name`` is a stripped string, or
        "Unknown" when nothing was found; ``emails`` is the list of
        ``EMAIL_RE`` matches; ``phones`` is the list returned by
        ``extract_phone_numbers``.
    """
    text = text or ""
    emails = EMAIL_RE.findall(text)
    phones = extract_phone_numbers(text)
    name = None

    # 1. Explicit "Name:" or "Full Name:" label.
    # The captured value is limited to letters, spaces, tabs and hyphens so it
    # stops at the end of the line; allowing any whitespace would let it run
    # into the next line (e.g. "John Doe\nEmail"). The leading \b keeps labels
    # such as "Username:" from matching.
    match = re.search(
        r'(?i)\b(?:full\s*name|name)\s*[:\-]\s*([A-Za-z][A-Za-z \t\-]+)',
        text,
    )
    if match:
        name = match.group(1).strip()

    # 2. Try the top few lines; the name usually appears early.
    if not name or len(name.split()) < 2:
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        for line in lines[:10]:
            lowered = line.lower()
            if any(word in lowered for word in HEADING_WORDS):
                continue
            if EMAIL_RE.search(line) or extract_phone_numbers(line):
                continue
            if any(char.isdigit() for char in line):
                continue
            parsed = HumanName(line)
            if parsed.first and parsed.last:
                name = str(parsed)
                break
            if len(line.split()) <= 3:
                # Short line that did not parse into first + last; accept it as-is.
                name = line
                break

    # 3. Fallback: spaCy PERSON entities, preferring one with at least two words.
    if not name:
        doc = nlp(text)
        names = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]
        if names:
            name = next((n for n in names if len(n.split()) >= 2), names[0])

    # 4. Fallback: derive a name from the e-mail's local part, treating digits,
    # dots, underscores and hyphens as word separators.
    if (not name or len(name.split()) < 2) and emails:
        local = re.sub(r'[\d\._\-]+', ' ', emails[0].split("@")[0])
        name = " ".join(w.capitalize() for w in local.split())

    return (name or "Unknown").strip(), emails, phones

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Each page's text is followed by a newline; a page whose ``extract_text()``
    result is falsy contributes only that newline.
    """
    with pdfplumber.open(file_path) as document:
        page_chunks = [(page.extract_text() or "") + "\n" for page in document.pages]
    return "".join(page_chunks)


def extract_text_from_docx(file_path):
    """Return the text of the DOCX file at ``file_path`` with surrounding whitespace removed."""
    raw_text = docx2txt.process(file_path)
    return raw_text.strip()


# ---------------------- ROUTES ----------------------
@app.route("/")
def home():
    """Answer requests for the root URL with a fixed status string."""
    status_message = "WhatsApp Resume Parser is running!"
    return status_message


def _twiml_reply(text):
    """Wrap ``text`` in a complete TwiML messaging response served as XML."""
    twiml = MessagingResponse()
    # message() returns the nested <Message> element, so the parent response
    # object (not that return value) must be serialised to get a <Response> root.
    twiml.message(text)
    return Response(str(twiml), mimetype="application/xml")


@app.route("/whatsapp", methods=["POST"])
def whatsapp():
    """Handle an incoming WhatsApp webhook request.

    Reads ``From``, ``Body``, ``MediaUrl0`` and ``MediaContentType0`` from the
    request. If a PDF or Word attachment is present it is downloaded with the
    Twilio credentials, passed to ``upload_to_drive`` and converted to text;
    otherwise the message body is used as the text. The extracted name,
    e-mails and phone numbers are appended to the spreadsheet and echoed back
    to the sender.

    Returns:
        A Flask ``Response`` containing TwiML: a summary of the extracted
        fields, or a failure notice when the attachment could not be
        downloaded.
    """
    from_number = request.values.get("From", "")
    body = request.values.get("Body", "").strip()
    media_url = request.values.get("MediaUrl0")
    # The content type may be missing; fall back to "" so the substring checks
    # below cannot raise TypeError on None.
    media_type = request.values.get("MediaContentType0") or ""

    text_content = body
    drive_link = ""
    saved_to_drive = False

    if media_url:
        print(f"Received file: {media_url} ({media_type})")
        content_kind = media_type.lower()
        file_name = None
        if "pdf" in content_kind:
            file_name = "resume.pdf"
        elif "docx" in content_kind or "word" in content_kind:
            file_name = "resume.docx"

        if file_name:
            # A per-request temporary directory keeps concurrent webhooks from
            # overwriting each other's download and removes the file afterwards.
            # The file's base name is unchanged from the fixed names used before.
            with tempfile.TemporaryDirectory() as tmp_dir:
                file_path = os.path.join(tmp_dir, file_name)

                try:
                    # The timeout keeps a stalled download from blocking the worker forever.
                    r = requests.get(
                        media_url,
                        auth=HTTPBasicAuth(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN),
                        timeout=30,
                    )
                except requests.RequestException as e:
                    print("Failed to download file:", e)
                    return _twiml_reply("Failed to download your resume. Please try again.")

                if r.status_code != 200:
                    print("Failed to download file:", r.status_code)
                    return _twiml_reply("Failed to download your resume. Please try again.")

                with open(file_path, "wb") as f:
                    f.write(r.content)
                print("File downloaded successfully ")

                # Best effort: a Drive failure must not stop parsing.
                try:
                    drive_link = upload_to_drive(file_path, DRIVE_FOLDER_ID)
                except Exception as e:
                    print("Drive upload error:", e)
                else:
                    saved_to_drive = True

                # Best effort: if the file cannot be read, keep the message body as the text.
                try:
                    if file_name.endswith(".pdf"):
                        text_content = extract_text_from_pdf(file_path)
                    else:
                        text_content = extract_text_from_docx(file_path)
                except Exception as e:
                    print("Text extraction error:", e)

    # Extract structured info
    name, emails, phones = extract_basic(text_content)
    print("EXTRACTED ->", name, emails, phones)

    # Save to Google Sheet (best effort; the sender still gets a reply on failure)
    row = [from_number, name, ",".join(emails), ",".join(phones), text_content[:200], drive_link]
    try:
        append_row(SPREADSHEET_ID, row)
    except Exception as e:
        print("Sheet error:", e)

    # WhatsApp reply
    reply_lines = [
        "Got your resume!",
        f"Name: {name}",
        f"Email: {','.join(emails) or 'N/A'}",
        f"Phone: {','.join(phones) or 'N/A'}",
    ]
    # Only claim the Drive save when the upload call actually completed.
    if saved_to_drive:
        reply_lines.append("Saved to Drive 📂")
    return _twiml_reply("\n".join(reply_lines))


if __name__ == "__main__":
    # Flask's debug mode enables an interactive debugger that can execute
    # arbitrary Python code from the browser, so it is off unless FLASK_DEBUG
    # is explicitly set to a truthy value (intended for local development).
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(port=5000, debug=debug_enabled)