-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarize-enron.py
More file actions
99 lines (64 loc) · 3.48 KB
/
summarize-enron.py
File metadata and controls
99 lines (64 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import argparse
import sys
# Column names assigned to the header-less input CSV, in file order.
EMAIL_COLUMNS = ['time', 'message identifier', 'sender', 'recipients', 'topic', 'mode']

# How many of the most prolific senders are analysed in parts 2 and 3.
TOP_SENDER_COUNT = 5


def load_emails(path):
    """Read the header-less email CSV and add the derived columns.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns in ``EMAIL_COLUMNS`` plus:
        ``'real time'`` -- ``'time'`` parsed as epoch milliseconds and
        truncated to a monthly period;
        ``'recipient'`` -- ``'recipients'`` split on ``'|'`` into a list.

    Raises:
        ValueError: If the file does not have exactly six columns.
    """
    emails = pd.read_csv(path, header=None)
    emails.columns = EMAIL_COLUMNS
    emails['real time'] = pd.to_datetime(emails['time'], unit='ms').dt.to_period('M')
    emails['recipient'] = emails['recipients'].str.split('|')
    return emails


def count_emails_by_person(emails, exploded):
    """Count emails sent and received per person.

    Args:
        emails: Frame from ``load_emails`` (one row per message).
        exploded: ``emails.explode('recipient')`` (one row per recipient).

    Returns:
        DataFrame with columns ``person``, ``sent``, ``received``, sorted by
        ``sent`` descending. People who appear on only one side get 0 for
        the other count.
    """
    sent = (
        emails.groupby('sender')['message identifier'].count()
        .rename('sent').rename_axis('person').reset_index()
    )
    received = (
        exploded.groupby('recipient')['message identifier'].count()
        .rename('received').rename_axis('person').reset_index()
    )
    totals = sent.merge(received, how='outer', on='person')
    # Sort before filling so that people who never sent anything (NaN) end
    # up after every real sender.
    return totals.sort_values(by=['sent'], ascending=False).fillna(0)


def monthly_sent_counts(emails, top_senders):
    """Count messages sent per month by each of ``top_senders``.

    Returns:
        DataFrame with columns ``sender``, ``real time``, ``counts``.
    """
    is_top = emails['sender'].isin(top_senders)
    subset = emails.loc[is_top, ['real time', 'sender', 'message identifier']]
    return (
        subset.groupby(['sender', 'real time']).count()
        .rename(columns={'message identifier': 'counts'})
        .reset_index()
    )


def monthly_contact_share(exploded, top_senders):
    """Compute each top sender's monthly share of unique contacts.

    For every month, count the distinct senders who wrote to each person in
    ``top_senders`` and express that count as a percentage of the month's
    total across all of ``top_senders``.

    Args:
        exploded: Frame with one row per (message, recipient) and at least
            the columns ``'real time'``, ``'recipient'``, ``'sender'``.
        top_senders: People whose incoming mail is analysed.

    Returns:
        DataFrame indexed by ``'real time'`` with one column per recipient;
        values are percentages, 0 where a recipient had no mail that month.
    """
    to_top = exploded['recipient'].isin(top_senders)
    received = exploded.loc[to_top, ['real time', 'recipient', 'sender']].reset_index(drop=True)
    contacts = received.groupby(['real time', 'recipient'])['sender'].nunique().reset_index()
    # Sum only the numeric 'sender' count. Summing the whole frame and merging
    # it back also aggregates (or rejects) the string 'recipient' column on
    # pandas >= 2.0, which leaves no plain 'recipient' column to pivot on.
    contacts['sum'] = contacts.groupby('real time')['sender'].transform('sum')
    contacts['proportion'] = contacts['sender'] / contacts['sum'] * 100
    return contacts.pivot(index='real time', columns='recipient', values='proportion').fillna(0)


def plot_sent_over_time(monthly_counts, out_path):
    """Draw one line per sender of monthly sent counts and save it as PNG."""
    fig, ax = plt.subplots(figsize=(16, 8))
    for label, subgroup in monthly_counts.groupby('sender'):
        subgroup.plot(x='real time', y='counts', ax=ax, label=label,
                      title='Emails Sent by the Top 5 Senders')
    ax.legend()
    ax.set_xlabel("Time Line")
    ax.set_ylabel("Total Email Counts")
    fig.savefig(out_path)


def plot_contact_share(share, out_path):
    """Draw the monthly contact shares as a stacked bar chart and save it."""
    fig, ax = plt.subplots(figsize=(16, 8))
    share.plot.bar(stacked=True,
                   title='Percentage of People Who Contacted the Top 5 Senders',
                   ax=ax)
    ax.set_xlabel("Time Line")
    ax.set_ylabel("Relative Number of People as Percentage")
    fig.savefig(out_path)


def main(argv=None):
    """Run the three-part summary for the CSV named on the command line.

    Writes ``email_by_person.csv`` and two PNG charts into the current
    working directory.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``. ``argv[1]`` is the input CSV path.

    Raises:
        SystemExit: If no input path is supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'summarize-enron.py'
        sys.exit(f'usage: {prog} <path-to-email-csv>')

    emails = load_emails(argv[1])
    # One row per (message, recipient) so per-recipient statistics can be taken.
    exploded = emails.explode('recipient')

    # Part 1: total emails sent and received by each person.
    totals = count_emails_by_person(emails, exploded)
    totals.to_csv('email_by_person.csv')

    # Part 2: monthly volume of the most prolific senders.
    top_senders = totals['person'].head(TOP_SENDER_COUNT).tolist()
    plot_sent_over_time(monthly_sent_counts(emails, top_senders),
                        'Emails Sent by the Top 5 Senders.png')

    # Part 3: relative number of unique people contacting those senders.
    plot_contact_share(monthly_contact_share(exploded, top_senders),
                       'Percentage of People Who Contacted the Top 5 Senders.png')

    print('mission complete')


if __name__ == "__main__":
    main()