-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathversion3.py
More file actions
151 lines (118 loc) · 5.16 KB
/
version3.py
File metadata and controls
151 lines (118 loc) · 5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import string
from PyPDF2 import PdfReader
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Absolute path of the directory that contains this script.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Input files are looked up next to the script, not in the working directory.
resume_file_path = os.path.join(current_dir, "resume.pdf")
job_description_file_path = os.path.join(current_dir, "job_description.txt")
class ATSScanner:
    """Compare a resume against a job description using stemmed keywords.

    Text is lowercased, tokenized with NLTK, stripped of non-alphabetic
    tokens and stop words (English stop words plus job-posting boilerplate),
    and reduced to a set of Porter stems. Two scores are derived from those
    stem sets: a keyword-coverage percentage (``scan``) and a cosine
    similarity percentage (``calculate_cosine_similarity``).
    """

    # (lookup path, download id) for every NLTK resource this class relies on.
    _NLTK_RESOURCES = (
        ("corpora/stopwords", "stopwords"),
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
    )

    def __init__(self):
        """Ensure the NLTK data is present and build the stemmer/stop-word set."""
        # Check each resource on its own so that a machine which has some
        # packages but not others still gets the missing ones downloaded.
        # nltk.data.find signals a missing resource with LookupError.
        missing_packages = []
        for resource_path, package in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                missing_packages.append(package)
        if missing_packages:
            print("Downloading nltk datapackages..")
            for package in missing_packages:
                nltk.download(package)

        self.stemmer = PorterStemmer()
        # Generic job-posting vocabulary that carries no signal about the
        # candidate's actual skills; treated exactly like stop words.
        boilerplate = {
            "job", "title", "candidate", "description", "responsibilities",
            "requirements", "summary", "seeking", "ideal", "must", "key",
            "preferred", "previous", "prior", "professional", "program",
            "dedicated", "diverse", "assist", "support", "work", "role",
            "experience", "position", "qualifications", "skills",
        }
        self.stop_words = set(stopwords.words("english")).union(boilerplate)

    def clean_text(self, text):
        """Return the set of stemmed keywords found in *text*.

        Args:
            text: Raw text (``str``). ``bytes`` are decoded as UTF-8 with
                undecodable bytes dropped; ``None`` or empty input yields an
                empty set.

        Returns:
            A set of Porter stems of the alphabetic, non-stop-word tokens.
        """
        if not text:
            return set()
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="ignore")
        tokens = word_tokenize(text.lower())
        # Stop words are filtered before stemming, so the stop-word list is
        # matched against surface forms rather than stems.
        return {
            self.stemmer.stem(token)
            for token in tokens
            if token.isalpha() and token not in self.stop_words
        }

    def extract_text_from_resume(self, input_file):
        """Extract the text of every page of a PDF.

        Args:
            input_file: Path or file object accepted by ``PdfReader``.

        Returns:
            The concatenated page text. On any failure the method does not
            raise; it returns a string of the form
            ``"Error:<exception> has occurred"`` which callers must check for.
        """
        try:
            reader = PdfReader(input_file)
            # A page with no extractable text may yield None/empty; treat it
            # as empty instead of failing the whole document.
            return "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            # Deliberate best-effort contract kept for existing callers.
            return f"Error:{e} has occurred"

    def calculate_cosine_similarity(self, resume_text, jd_text):
        """Return the cosine similarity of the two texts as a percentage.

        Both texts go through ``clean_text`` first. Because that returns a
        *set* of stems, each stem is counted at most once per document, so
        the vectors compared here are binary presence vectors.

        Args:
            resume_text: Raw resume text.
            jd_text: Raw job-description text.

        Returns:
            A float in the range 0.0-100.0, rounded to two decimals. Returns
            0.0 when either text has no usable keywords.
        """
        resume_terms = self.clean_text(resume_text)
        jd_terms = self.clean_text(jd_text)
        # Nothing to compare: avoid CountVectorizer's empty-vocabulary error.
        if not resume_terms or not jd_terms:
            return 0.0

        # CountVectorizer expects strings, so join the stem sets back up.
        documents = [" ".join(resume_terms), " ".join(jd_terms)]
        try:
            count_matrix = CountVectorizer().fit_transform(documents)
        except ValueError:
            # The default token pattern ignores one-character tokens; if only
            # such stems remain the vocabulary is empty and fitting fails.
            return 0.0

        match_percentage = cosine_similarity(count_matrix)[0][1] * 100
        return round(float(match_percentage), 2)

    def scan(self, resume_text, job_desc_text):
        """Measure how many job-description keywords the resume covers.

        Args:
            resume_text: Raw resume text.
            job_desc_text: Raw job-description text.

        Returns:
            A tuple ``(score, matches, missing)`` where ``score`` is the
            percentage of unique job-description stems found in the resume,
            ``matches`` is the set of stems present in both, and ``missing``
            is the set of job-description stems absent from the resume.
            ``(0.0, set(), set())`` when the job description has no keywords.
        """
        resume_set = self.clean_text(resume_text)
        jd_set = self.clean_text(job_desc_text)
        if not jd_set:
            return 0.0, set(), set()

        matches = resume_set & jd_set
        missing = jd_set - resume_set
        score = (len(matches) / len(jd_set)) * 100
        return score, matches, missing
# --- RUNNER CODE ---
if __name__ == "__main__":
    # Script entry point: load the resume and job description that sit next
    # to this file, score them, and print a human-readable report.
    scanner = ATSScanner()
    print("--- ATS RESUME HACKER v3.0 ---")

    if resume_file_path.endswith(".pdf"):
        resume_input = scanner.extract_text_from_resume(input_file=resume_file_path)
        # extract_text_from_resume reports failure by returning an
        # "Error:... has occurred" string instead of raising; scoring that
        # string as if it were the resume would produce a bogus report.
        if resume_input.startswith("Error:") and resume_input.endswith(" has occurred"):
            raise SystemExit(f"Could not read resume PDF: {resume_input}")
    else:
        # Read as text (not bytes) so the tokenizer receives a str.
        with open(resume_file_path, "r", encoding="utf-8") as filp:
            resume_input = filp.read()

    with open(job_description_file_path, "r", encoding="utf-8") as filp:
        jd_input = filp.read()

    # Run the Logic
    score, matches, missing = scanner.scan(resume_input, jd_input)
    cosine_score = scanner.calculate_cosine_similarity(resume_input, jd_input)

    # Output the Report
    print("\n" + "="*30)
    print(f"MATCH SCORE: {score:.1f}%")
    print(f"COSINE (AI) SCORE: {cosine_score:.1f}%")
    print("="*30)

    print(f"\n✅ MATCHED KEYWORDS ({len(matches)}):")
    print(", ".join(sorted(matches)))

    print(f"\n⚠️ MISSING CRITICAL KEYWORDS ({len(missing)}):")
    # Missing keywords are printed in red using ANSI escape codes.
    RED = "\033[91m"
    RESET = "\033[0m"
    for word in sorted(missing):
        print(f"{RED}- {word}{RESET}")

    print("\n" + "="*30)
    # The verdict gates on the cosine score first, then on keyword coverage.
    if cosine_score < 50:
        print("Verdict: 🗑️ AUTO-REJECTED. You need to add the missing keywords.")
    elif score < 80:
        print("Verdict: ⚠️ MAYBE. Human might read it if lucky.")
    else:
        print("Verdict: 🚀 INTERVIEW LIKELY. Great job.")