-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoperations.py
More file actions
158 lines (129 loc) · 6.19 KB
/
operations.py
File metadata and controls
158 lines (129 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from mem0 import MemoryClient
import os
from llama_parse import LlamaParse
import re
from enum import Enum
class ContextType(Enum):
    """Kinds of context item this module tags in memory.

    Each member's value is the string written under the ``context_type``
    metadata key when an item is stored, and compared against when items
    are loaded back.
    """

    # Text extracted from the candidate's CV.
    CV = "cv_text"

    # Description of the job being prepared for.
    JOB_DESCRIPTION = "job_description"

    # Information about the interviewer.
    INTERVIEWER_INFO = "interviewer_info"
class PreprOperations:
    """Interview-preparation helpers bound to a single user id.

    On construction the instance creates a mem0 ``MemoryClient`` (used to
    store and load context items) and a ``LlamaParse`` parser (used to
    extract text from a CV file). Both are configured from environment
    variables.
    """

    # --- Redaction rules used by ``anonymize_text`` -----------------------
    # Each rule is a regex source string; they are applied in the order
    # email -> phone -> company -> name (see ``anonymize_text`` for why).

    # The TLD class is ``[A-Za-z]``; a ``|`` inside a character class is a
    # literal pipe, not alternation, and would let "a@x.com|b@y.org" be
    # consumed as one malformed match.
    _EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # No leading ``\b`` before "(" or "+": both are non-word characters, so
    # ``\b`` there would only match when a word character precedes them and
    # would miss numbers at the start of a line or after whitespace.
    _PHONE_PATTERNS = (
        r'\b\d{3}-\d{3}-\d{4}\b',                # 123-456-7890
        r'\(\d{3}\)\s*\d{3}-\d{4}\b',            # (123) 456-7890
        r'\b\d{3}\.\d{3}\.\d{4}\b',              # 123.456.7890
        r'\+\d{1,3}\s*\d{3}\s*\d{3}\s*\d{4}\b',  # +1 123 456 7890
    )

    # Known tech companies, matched case-insensitively as whole words.
    _KNOWN_COMPANY_PATTERNS = (
        r'\bGoogle\b', r'\bMicrosoft\b', r'\bApple\b', r'\bAmazon\b', r'\bMeta\b',
        r'\bFacebook\b', r'\bNetflix\b', r'\bTesla\b', r'\bUber\b', r'\bAirbnb\b',
        r'\bStripe\b', r'\bSpotify\b', r'\bSlack\b', r'\bZoom\b', r'\bDropbox\b',
        r'\bSalesforce\b', r'\bOracle\b', r'\bIBM\b', r'\bIntel\b', r'\bNVIDIA\b',
        r'\bAdobe\b', r'\bTwitter\b', r'\bLinkedIn\b', r'\bSquare\b', r'\bPayPal\b',
        r'\bSiriusXM\b', r'\bPandora\b',
    )

    # "<word> <legal suffix>" pairs, matched case-insensitively. Only the
    # single word directly before the suffix is redacted.
    _COMPANY_SUFFIX_PATTERNS = (
        r'\b\w+\s+Inc\.?\b', r'\b\w+\s+LLC\.?\b', r'\b\w+\s+Corp\.?\b',
        r'\b\w+\s+Corporation\b', r'\b\w+\s+Limited\b', r'\b\w+\s+Ltd\.?\b',
        r'\b\w+\s+Company\b', r'\b\w+\s+Co\.?\b',
    )

    # Heuristic: any two adjacent Capitalized words count as a name, so
    # ordinary title-case phrases are redacted as well.
    _NAME_PATTERNS = (
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',               # First Last
        r'\b[A-Z][a-z]+\s+[A-Z]\.\s+[A-Z][a-z]+\b',     # First M. Last
    )

    def __init__(self, user_id: str):
        """Create the memory client and parser for ``user_id``.

        Raises:
            ValueError: If ``MEM0_API_KEY`` or ``LLAMA_PARSE_API_KEY`` is
                missing from the environment.
        """
        self.user_id = user_id
        self.memory_client = self.get_memory_client()
        self.parser = self.get_parser()

    def get_memory_client(self) -> "MemoryClient":
        """Build a ``MemoryClient`` from the ``MEM0_API_KEY`` env variable.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        mem0_api_key = os.getenv("MEM0_API_KEY")
        if not mem0_api_key:
            raise ValueError("MEM0_API_KEY not found in environment variables")
        return MemoryClient(api_key=mem0_api_key)

    def get_parser(self) -> "LlamaParse":
        """Build a text-mode ``LlamaParse`` from ``LLAMA_PARSE_API_KEY``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        llama_parse_api_key = os.getenv("LLAMA_PARSE_API_KEY")
        if not llama_parse_api_key:
            raise ValueError("LLAMA_PARSE_API_KEY not found in environment variables")
        return LlamaParse(api_key=llama_parse_api_key, result_type="text", verbose=False)

    def parse_cv(self, cv_file_path: str) -> str:
        """Parse a CV file with LlamaParse and return its anonymized text.

        Args:
            cv_file_path: Path of the CV file to parse.

        Returns:
            The anonymized text of all parsed documents joined by newlines.
            Failures are NOT raised: a missing file or a parser error yields
            a message string starting with "❌ Error", so callers that need
            to detect failure must check for that prefix.
        """
        if not os.path.exists(cv_file_path):
            return f"❌ Error: CV file not found: {cv_file_path}"
        try:
            print(f"🔄 Parsing CV: {cv_file_path}")
            documents = self.parser.load_data(cv_file_path)
            # One text chunk per parsed document, newline-separated.
            cv_text = "\n".join(doc.text for doc in documents)
            # The result is only returned; nothing is stored here.
            return self.anonymize_text(cv_text.strip())
        except Exception as e:
            return f"❌ Error parsing CV: {str(e)}"

    def _filter_memories(self, memories, context_type: "ContextType"):
        """Return the memories whose metadata carries ``context_type``.

        A record matches when its ``metadata`` mapping has a
        ``context_type`` entry equal to ``context_type.value``. Records
        that are not dicts, or whose metadata is missing or ``None``, are
        skipped instead of raising, and extra metadata keys do not prevent
        a match.
        """
        wanted = context_type.value
        return [
            record
            for record in (memories or [])
            if isinstance(record, dict)
            and (record.get("metadata") or {}).get("context_type") == wanted
        ]

    def load_context_from_memory(self, context_type: "ContextType"):
        """Load the stored text for ``context_type`` from memory.

        Returns:
            The ``"memory"`` field of the selected record, or ``None`` when
            nothing matches or any error occurs (the error is printed).
        """
        try:
            all_memories = self.memory_client.get_all(user_id=self.user_id)
            # NOTE(review): some mem0 response formats appear to wrap the
            # list as {"results": [...]} - confirm against the SDK version
            # in use. Unwrapping is harmless when a plain list is returned.
            if isinstance(all_memories, dict):
                all_memories = all_memories.get("results", [])

            # Filtering is done client-side because the SDK filter does not work.
            matches = self._filter_memories(all_memories, context_type)
            if not matches:
                print(f"📝 No {context_type} found in memory")
                return None
            if len(matches) > 1:
                # NOTE(review): "most recent" assumes get_all returns records
                # oldest-first, so the last match is the newest - confirm.
                print(f"📝 Multiple {context_type} found in memory, using the most recent one")
            else:
                print(f"📝 Found {context_type} in memory")
            return matches[-1]["memory"]
        except Exception as e:
            print(f"Error loading {context_type} from memory: {e}")
            return None

    def store_context_in_memory(self, content: str, context_type: "ContextType"):
        """Store ``content`` in memory, tagged with ``context_type``.

        The content is sent as a single user message with ``infer=False``
        and ``{"context_type": context_type.value}`` as metadata. Errors
        are printed rather than raised; the method always returns ``None``.
        """
        try:
            messages = [
                {"role": "user", "content": content}
            ]
            self.memory_client.add(
                messages,
                user_id=self.user_id,
                metadata={"context_type": context_type.value},
                infer=False
            )
            print(f"✅ Stored {context_type} in memory")
        except Exception as e:
            print(f"❌ Error storing {context_type} in memory: {str(e)}")

    def anonymize_text(self, text: str) -> str:
        """Replace emails, phone numbers, companies and names with tags.

        Matches are replaced by ``[EMAIL]``, ``[PHONE]``, ``[COMPANY]`` and
        ``[NAME]``. The rules are regex heuristics, not a guarantee that
        all personal data is removed.

        Order matters:
          * Emails go first; otherwise a company rule can rewrite the
            domain ("jane@google.com" -> "jane@[COMPANY].com") and the
            address no longer matches the email rule.
          * Companies go before names; otherwise "Acme Inc" is consumed by
            the two-capitalized-words name rule and tagged ``[NAME]``.
        """
        anonymized_text = re.sub(self._EMAIL_PATTERN, '[EMAIL]', text)

        for pattern in self._PHONE_PATTERNS:
            anonymized_text = re.sub(pattern, '[PHONE]', anonymized_text)

        for pattern in self._KNOWN_COMPANY_PATTERNS:
            anonymized_text = re.sub(pattern, '[COMPANY]', anonymized_text, flags=re.IGNORECASE)

        for pattern in self._COMPANY_SUFFIX_PATTERNS:
            anonymized_text = re.sub(pattern, '[COMPANY]', anonymized_text, flags=re.IGNORECASE)

        for pattern in self._NAME_PATTERNS:
            anonymized_text = re.sub(pattern, '[NAME]', anonymized_text)

        return anonymized_text
def main():
    """Store the placeholder text "test" as the CV context item.

    Runs against the fixed user id "interview_prep_agent". Parsing a real
    CV file via ``parse_cv`` is not done here.
    """
    PreprOperations(user_id="interview_prep_agent").store_context_in_memory(
        "test", ContextType.CV
    )


# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()