-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.py
More file actions
157 lines (128 loc) · 5.42 KB
/
app.py
File metadata and controls
157 lines (128 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import re
import spacy
import requests
import pdfplumber
import docx2txt
import phonenumbers
from flask import Flask, request, Response
from twilio.twiml.messaging_response import MessagingResponse
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth
from nameparser import HumanName
from google_sheets import append_row
from google_drive import upload_to_drive
# ---------------------- ENV SETUP ----------------------
# Populate os.environ via python-dotenv before reading the settings below
# (presumably from a local .env file -- confirm deployment setup).
load_dotenv()
# Google Sheet that parsed rows are appended to (passed to append_row).
SPREADSHEET_ID = os.getenv("SPREADSHEET_ID")
# Google Drive folder that received resumes are uploaded to (passed to upload_to_drive).
DRIVE_FOLDER_ID = os.getenv("DRIVE_FOLDER_ID")
# Twilio credentials, used as HTTP basic auth when downloading media attachments.
TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
# spaCy pipeline loaded once at import time; extract_basic reads its PERSON entities.
nlp = spacy.load("en_core_web_sm")
app = Flask(__name__)
# Loose e-mail pattern: word chars/dots/hyphens, '@', then a domain containing at least one dot.
EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
# Lower-case words that mark a line as a section heading rather than a name;
# extract_basic skips any early line that contains one of them as a substring.
HEADING_WORDS = {"resume", "curriculum", "vitae", "professional", "summary", "objective", "profile", "experience", "education", "skills"}
# ---------------------- HELPERS ----------------------
def extract_phone_numbers(text):
    """Return the distinct phone numbers found in *text*, formatted as E.164.

    Numbers are located with ``phonenumbers.PhoneNumberMatcher`` using "IN"
    as the default region for numbers written without a country code.

    Args:
        text: Free-form text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of E.164 strings without duplicates, in order of first
        appearance in *text*.
    """
    if not text:
        return []
    formatted = (
        phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)
        for match in phonenumbers.PhoneNumberMatcher(text, "IN")
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the numbers in an order that can differ from run to run.
    return list(dict.fromkeys(formatted))
def extract_basic(text):
    """Extract a person's name, e-mail addresses and phone numbers from *text*.

    The name is resolved by trying, in order:

    1. an explicit ``Name:`` / ``Full Name:`` label;
    2. the first ten non-empty lines, skipping lines that contain a heading
       word, an e-mail address, a phone number or any digit;
    3. spaCy ``PERSON`` entities (preferring one with at least two words);
    4. the local part of the first e-mail address.

    Args:
        text: Plain text of the resume or message.

    Returns:
        A ``(name, emails, phones)`` tuple. ``name`` is ``"Unknown"`` when no
        strategy produced anything; ``emails`` and ``phones`` are lists of
        strings (possibly empty).
    """
    emails = EMAIL_RE.findall(text)
    phones = extract_phone_numbers(text)
    name = None

    # 1. Explicit "Name:" or "Full Name:" label.
    # The captured value is limited to spaces/tabs (not \s) so it stops at the
    # end of the line instead of swallowing the first word of the next one.
    label_match = re.search(
        r'(?i)(?:full\s*name|name)\s*[:\-]\s*([A-Za-z][A-Za-z \t\-]+)', text
    )
    if label_match:
        name = label_match.group(1).strip()

    # 2. Try the top few lines -- the name usually appears early.
    if not name or len(name.split()) < 2:
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        for line in lines[:10]:
            lowered = line.lower()
            if any(word in lowered for word in HEADING_WORDS):
                continue
            if EMAIL_RE.search(line) or extract_phone_numbers(line):
                continue
            if any(char.isdigit() for char in line):
                continue
            parsed = HumanName(line)
            if parsed.first and parsed.last:
                name = str(parsed)
                break
            if len(line.split()) <= 3:
                # Short line without a recognisable first+last pair: accept as-is.
                name = line
                break

    # 3. Fallback: spaCy PERSON entities.
    if not name:
        doc = nlp(text)
        people = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]
        if people:
            name = next((p for p in people if len(p.split()) >= 2), people[0])

    # 4. Fallback: derive a name from the first e-mail's local part.
    if (not name or len(name.split()) < 2) and emails:
        local = re.sub(r'[\d\._\-]+', ' ', emails[0].split("@")[0])
        candidate = " ".join(word.capitalize() for word in local.split())
        # An all-digit/punctuation local part yields nothing; in that case keep
        # whatever single-word name was already found instead of discarding it.
        if candidate:
            name = candidate

    return (name or "Unknown").strip(), emails, phones
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Each page's text is followed by a newline; a page for which
    ``extract_text()`` returns nothing contributes only that newline.
    """
    with pdfplumber.open(file_path) as document:
        page_chunks = [
            (page.extract_text() or "") + "\n" for page in document.pages
        ]
    return "".join(page_chunks)
def extract_text_from_docx(file_path):
    """Return the text docx2txt extracts from *file_path*, with surrounding whitespace removed."""
    raw_text = docx2txt.process(file_path)
    return raw_text.strip()
# ---------------------- ROUTES ----------------------
@app.route("/")
def home():
    """Serve a plain-text status message at the site root."""
    status_message = "WhatsApp Resume Parser is running!"
    return status_message
def _twiml_reply(text):
    """Wrap *text* in a complete TwiML <Response> served as XML."""
    twiml = MessagingResponse()
    twiml.message(text)
    # Serialise the response object itself: message() returns the nested
    # <Message> element, which on its own is not a valid TwiML document.
    return Response(str(twiml), mimetype="application/xml")


@app.route("/whatsapp", methods=["POST"])
def whatsapp():
    """Handle an incoming Twilio WhatsApp webhook.

    Reads the sender, message body and first media attachment from the
    request. A PDF or Word attachment is downloaded with the Twilio
    credentials, uploaded to Drive and converted to text; otherwise the
    message body is used as the text. Name, e-mails and phone numbers are
    extracted from that text, a row is appended to the Google Sheet, and a
    TwiML reply summarising the result is returned.

    Drive upload, text extraction and the Sheet append are best-effort:
    their errors are printed and the request still gets a reply. A failed
    download returns an error reply immediately.

    NOTE(review): the request is not checked against Twilio's signature, so
    anyone who can reach this endpoint can make it fetch an arbitrary
    ``MediaUrl0`` with the Twilio credentials attached -- consider validating
    the ``X-Twilio-Signature`` header.
    """
    import tempfile  # function-scope import: not among the module-level imports

    from_number = request.values.get("From", "")
    body = request.values.get("Body", "").strip()
    media_url = request.values.get("MediaUrl0")
    # The content type may be absent; normalise to "" so the substring checks
    # below cannot raise TypeError. MIME types are case-insensitive.
    media_type = (request.values.get("MediaContentType0") or "").lower()

    text_content = body
    drive_link = ""

    if media_url:
        print(f"Received file: {media_url} ({media_type})")
        file_name = None
        if "pdf" in media_type:
            file_name = "resume.pdf"
        elif "docx" in media_type or "word" in media_type:
            file_name = "resume.docx"

        if file_name:
            # A per-request temporary directory keeps the original file name
            # (upload_to_drive receives the same basename) while preventing
            # concurrent requests from overwriting each other's download, and
            # removes the file once processing is finished.
            with tempfile.TemporaryDirectory() as work_dir:
                file_path = os.path.join(work_dir, file_name)

                try:
                    r = requests.get(
                        media_url,
                        auth=HTTPBasicAuth(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN),
                        timeout=10,  # never let a stalled download hang the worker
                    )
                except requests.RequestException as e:
                    print("Failed to download file:", e)
                    return _twiml_reply("Failed to download your resume. Please try again.")

                if r.status_code != 200:
                    print("Failed to download file:", r.status_code)
                    return _twiml_reply("Failed to download your resume. Please try again.")

                with open(file_path, "wb") as f:
                    f.write(r.content)
                print("File downloaded successfully ")

                try:
                    drive_link = upload_to_drive(file_path, DRIVE_FOLDER_ID)
                except Exception as e:
                    print("Drive upload error:", e)

                # Extraction can fail on corrupt files or on legacy .doc
                # uploads (matched by "word" above but not a real .docx);
                # fall back to the message body instead of returning a 500.
                try:
                    if file_name.endswith(".pdf"):
                        text_content = extract_text_from_pdf(file_path)
                    else:
                        text_content = extract_text_from_docx(file_path)
                except Exception as e:
                    print("Text extraction error:", e)

    # Extract structured info
    name, emails, phones = extract_basic(text_content)
    print("EXTRACTED ->", name, emails, phones)

    # Save to Google Sheet (only the first 200 characters of the text are stored)
    row = [from_number, name, ",".join(emails), ",".join(phones), text_content[:200], drive_link]
    try:
        append_row(SPREADSHEET_ID, row)
    except Exception as e:
        print("Sheet error:", e)

    # WhatsApp reply
    msg = f"Got your resume!\nName: {name}\nEmail: {','.join(emails) or 'N/A'}\nPhone: {','.join(phones) or 'N/A'}\nSaved to Drive 📂"
    return _twiml_reply(msg)
if __name__ == "__main__":
    # This app receives webhooks from the public internet, and Flask's debug
    # mode exposes an interactive debugger on errors. Keep it off unless
    # explicitly requested, e.g. FLASK_DEBUG=1 in the environment or .env.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(port=5000, debug=debug_enabled)