-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
79 lines (66 loc) · 2.68 KB
/
preprocessor.py
File metadata and controls
79 lines (66 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
import re
# Invisible directional / BOM marks that chat exports embed around timestamps;
# left in place they break both the prefix check and date parsing.
_INVISIBLE_CHARS_RE = re.compile(
    r'[\u200e\u200f\ufeff\u202a\u202c\u202d\u2066\u2069]'
)

# Column layout of the frame returned by preprocess(), also used for the
# empty result so callers always see the same schema.
_OUTPUT_COLUMNS = [
    'date', 'user', 'message', 'year', 'month_num', 'month', 'day',
    'day_name', 'only_date', 'hour', 'minute', 'period',
]


def _split_dated_line(line):
    """Return ``(timestamp, remainder)`` if *line* starts a new entry, else ``None``.

    A line starts an entry when the text before the first ' - ' looks like
    a date and time (e.g. '19/08/24, 22:38') and parses successfully.
    """
    prefix, divider, remainder = line.partition(' - ')
    if not divider:
        return None
    prefix = prefix.strip()
    # Cheap pre-filter: a date has at least two '/' and a time has a ':'.
    if prefix.count('/') < 2 or ':' not in prefix:
        return None
    try:
        # dayfirst=True because the export uses DD/MM/YY; the comma between
        # date and time is removed before parsing.
        timestamp = pd.to_datetime(
            prefix.replace(',', ''), dayfirst=True, errors='raise'
        )
    except (ValueError, TypeError, OverflowError):
        # Not a real timestamp: the caller treats the line as a continuation.
        return None
    return timestamp, remainder


def _split_sender(entry):
    """Split an entry into ``(user, message)`` on the first ': ' of its first line.

    Only the first line is searched so that a ': ' inside a continuation
    line cannot be mistaken for the sender separator. Entries without a
    sender are attributed to 'group_notification'.
    """
    first_line, newline, rest = entry.partition('\n')
    sender, separator, body = first_line.partition(': ')
    if not separator:
        return 'group_notification', entry.strip()
    return sender.strip(), (body + newline + rest).strip()


def preprocess(data: str) -> pd.DataFrame:
    """Parse an exported chat log into a DataFrame with one row per message.

    Each entry is expected to look like ``'19/08/24, 22:38 - User: Message'``.
    Lines that do not start with a parsable timestamp are appended (joined
    with a newline) to the previous message; such lines appearing before the
    first dated entry are discarded.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with columns ``date``, ``user``, ``message``, ``year``,
        ``month_num``, ``month``, ``day``, ``day_name``, ``only_date``,
        ``hour``, ``minute`` and ``period`` (an hour bucket such as
        ``'22-23'``). If no dated entry is found, an empty DataFrame with
        those same columns is returned.
    """
    # Step 1: strip invisible characters, then group lines into dated entries.
    data = _INVISIBLE_CHARS_RE.sub('', data)

    dates = []
    messages = []
    for raw_line in data.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        parsed = _split_dated_line(line)
        if parsed is not None:
            timestamp, remainder = parsed
            dates.append(timestamp)
            messages.append(remainder)
        elif messages:
            # Undated line: continuation of the previous message.
            messages[-1] += '\n' + line

    if not dates:
        return pd.DataFrame(columns=_OUTPUT_COLUMNS)

    # Step 2: separate the sender from the message text.
    users = []
    texts = []
    for entry in messages:
        user, text = _split_sender(entry)
        users.append(user)
        texts.append(text)

    df = pd.DataFrame({'date': dates, 'user': users, 'message': texts})

    # Step 3: derive the calendar / time features.
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['only_date'] = df['date'].dt.date
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour bucket for the heatmap, e.g. '22-23'; the modulo wraps 23 to '23-00'.
    df['period'] = [
        f"{int(hour):02d}-{(int(hour) + 1) % 24:02d}" for hour in df['hour']
    ]
    return df