-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_data_to_csv.py
More file actions
282 lines (210 loc) · 9.41 KB
/
convert_data_to_csv.py
File metadata and controls
282 lines (210 loc) · 9.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""Convert data written by sender and receivers to csv files.
Plan:
1) Put all senders data files into a single csv file.
2) Put all receivers data files into a single csv file.
3) Create a summary csv file, broken down by phase, that includes the total
energy, total time, number of candidates, average throughput, etc.
4) Combine all summaries csv into a single csv file.
5) Check whether a folder has already been processed; if so, do not process it again.
______________________________________________________________
# Scattered ideas:
Load balancing: how good is it?
"""
# Add a function recv_msg_stat_parse() to parse the statistics of a single
# message. It will return a dictionary that has all the properties.
# I am thinking of embedding this inside parse_receiver.
# Add a function recv_cnd_stat() that parses the information about a found
# candidate.
# Should these two functions write directly to the csv file?
# New thought: we should not deeply nest the csv writing; only the parse
# sender function writes to it. So the above two functions should each return
# a one-line string to be added to the common csv file.
# WARNING: @todo senders and receivers don't agree on INTERVAL!
# see the folder long_message_attack/experiments/N10_nstates1099511627776_nsenders319_nreceivers609_diff3/data/stats$
# One entry per column of a receiver CSV row, in output order:
# (field name used in error messages, regex, capture group holding the value).
_RECEIVING_STAT_FIELDS = (
    ("total", r'total=(\d+\.\d+)sec', 1),
    ("mpi_recv", r'mpi_recv=(\d+\.\d+)sec', 1),
    ("dict_add", r'dict_add=(\d+\.\d+)sec', 1),
    # Same line as dict_add, but we keep the trailing MB/sec figure (group 3).
    ("dict_add_speed", r'dict_add=(\d+\.\d+)sec≈2\^(\d+\.\d+)≈(\d+\.\d+)MB/sec', 3),
    ("mpi_recv_percent", r'mpi_recv=(\d+\.\d+)%', 1),
    ("dict_add_percent", r'dict_add=(\d+\.\d+)%', 1),
    ("recv_speed", r'RECV (\d+\.\d+)MB/sec', 1),
    ("exp_all_receivers", r'exp\[all receivers\] = (\d+\.\d+) MB/sec', 1),
    ("nsenders", r'nsenders=(\d+)', 1),
    ("nservers", r'nservers=(\d+)', 1),
    ("difficulty", r'DIFFICULTY=(\d+)', 1),
    ("interval", r'INTERVAL=(\d+)', 1),
    ("nmsgs_recv", r'nmsgs_recv=(\d+)', 1),
)


def extract_receiving_stats(text):
    """Extract the stats of one "message received" block as a csv line.

    For every field in ``_RECEIVING_STAT_FIELDS`` the first occurrence in
    ``text`` is taken, and the values are joined with commas in this order:
    total, mpi_recv, dict_add, dict_add_speed, mpi_recv_percent,
    dict_add_percent, recv_speed, exp_all_receivers, nsenders, nservers,
    difficulty, interval, nmsgs_recv.

    Args:
        text: the raw text of one statistics block.

    Returns:
        A comma separated string of the 13 values (no trailing newline). The
        values are the matched substrings, not converted to numbers.

    Raises:
        IndexError: if a field is missing from ``text``. The message names
            the missing field.
    """
    import re

    values = []
    for name, pattern, group in _RECEIVING_STAT_FIELDS:
        found = re.search(pattern, text)
        if found is None:
            # IndexError is kept for backward compatibility with the previous
            # `findall(...)[0]` behaviour; the message now says what is missing.
            raise IndexError(
                f"receiving stats: field '{name}' not found "
                f"(pattern {pattern!r})"
            )
        values.append(found.group(group))
    return ",".join(values)
def extract_candidates_stats(text):
    """Extract the candidate counters of one statistics line as a csv line.

    Takes the first ``nfound_cnd=<int>`` and the first ``new_cnd=<int>`` in
    ``text``. Any other content of the line (e.g. a time field) is ignored.

    Args:
        text: the raw candidate statistics line.

    Returns:
        The string ``"<nfound_cnd>,<new_cnd>"`` (no trailing newline).

    Raises:
        IndexError: if one of the two counters is missing from ``text``. The
            message names the missing counter.
    """
    import re

    values = []
    for name in ("nfound_cnd", "new_cnd"):
        found = re.search(name + r'=(\d+)', text)
        if found is None:
            # IndexError is kept for backward compatibility with the previous
            # `findall(...)[0]` behaviour; the message now says what is missing.
            raise IndexError(
                f"candidate stats: field '{name}' not found in {text!r}"
            )
        values.append(found.group(1))
    return ",".join(values)
def parse_receiver_file(f_inp, f_csv):
    """Extract the data of one receiver stats file and append it to ``f_csv``.

    Layout expected from ``f_inp`` (as handled by the code below):

    * the first 6 lines hold the statistics of rehashing (regenerating) the
      message; they become one row with ``0,0`` in the candidate columns;
    * afterwards, receiving statistics are enclosed between two ``<-<-...``
      bracket lines, and the first non-bracket line after a closing bracket is
      parsed as the candidates statistics. The receiving stats are written
      first (without newline) and the candidates line completes the csv row.

    Args:
        f_inp: open text file of one receiver; ``f_inp.name`` must contain the
            receiver number.
        f_csv: open, writable text file that receives the csv rows.

    Raises:
        IndexError: if the file name contains no digits, or if a statistics
            block lacks an expected field (raised by the extract_* helpers).
    """
    import os
    import re

    # Take the receiver id from the file name itself. Searching the whole path
    # would pick up digits of a parent folder first (experiment folders are
    # named like "N10_nstates..."). Fall back to the full path only when the
    # base name has no digits, which is what the old code always did.
    id_candidates = (re.findall(r"(\d+)", os.path.basename(f_inp.name))
                     or re.findall(r"(\d+)", f_inp.name))
    receiver_name = id_candidates[0]

    bracket = "<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-\n"

    # The first 6 lines correspond to rehashing the message.
    text = "".join(f_inp.readline() for _ in range(6))
    csv_line = extract_receiving_stats(text)
    # Two candidate entries (nfound_cnd, new_cnd) are set to 0 for this row.
    f_csv.write(csv_line + ",0,0," + receiver_name + "\n")
    print(f"DONE with regen stats {receiver_name} receiver")

    text = ""
    is_bracket_open = False
    next_line_candidates = False

    # Actual parsing: continue from where the header reading stopped.
    for line in f_inp:
        if line == bracket:
            was_open = is_bracket_open
            is_bracket_open = not is_bracket_open
            if was_open:
                # We just closed a bracket, i.e. `text` is a full block of
                # receiving stats. No newline yet: the candidates line that
                # follows completes the row.
                text += line
                f_csv.write(extract_receiving_stats(text))
                text = ""
                next_line_candidates = True
                continue  # don't do extra computation
        elif next_line_candidates:
            # First non-bracket line after a closed block: candidates stats.
            csv_line = "," + extract_candidates_stats(line)
            csv_line += f",{receiver_name}\n"
            f_csv.write(csv_line)
            text = ""
            next_line_candidates = False
            continue  # don't do extra computation
        # Accumulate receiving statistics text (opening bracket included).
        text += line
def parse_receivers():
    """Process all receiver_* files and store the result in receivers.csv.

    Every entry of ``data/stats/`` whose name contains ``receiver_`` is parsed
    with parse_receiver_file() and the rows are written to
    ``data/stats/receivers.csv`` (overwritten on each run), after a header row.

    This function doesn't change path: ``data/stats/`` is resolved relative to
    the current working directory.
    """
    import os

    stats_dir = "data/stats/"
    file_names = [
        os.path.join(stats_dir, f_name)
        for f_name in os.listdir(stats_dir)
        if "receiver_" in f_name
    ]

    # `with` guarantees the csv is flushed and every handle is closed, even if
    # parsing one of the files raises.
    with open("data/stats/receivers.csv", "w") as csv_file:
        # csv header
        csv_file.write("time_sec,mpi_recv_sec,dict_add_sec,dict_add_speed_MB,mpi_recv_percent,dict_add_percent,recv_speed_MB,exp_all_receivers_MB,nsenders,nreceivers,difficulty,interval,nmsgs_recv,nfound_cnd,new_cnd,receiver_name\n")
        for f in file_names:
            with open(f, "r") as receiver_file:
                parse_receiver_file(receiver_file, csv_file)
def extract_sender_stat(text):
    """Extract sender stats into csv format.

    Every maximal run of digits and dots that contains at least one digit is
    taken as a value, in order of appearance, and the values are joined with
    commas. Runs made only of dots (a sentence-ending "." or an ellipsis) are
    skipped, since they are punctuation and would shift the csv columns.

    Args:
        text: the raw text of one sender statistics block, delimiter lines
            included.

    Returns:
        A comma separated string of the extracted values (no trailing
        newline); the empty string when ``text`` contains no number.
    """
    import re

    # Remove delimiters
    text = text.replace("->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->", "")

    # A run never spans a newline (the class only holds digits and '.'), so
    # scanning the whole text yields the same tokens as going line by line.
    # The `\d` in the middle is what rejects dots-only runs.
    return ",".join(re.findall(r'[\d.]*\d[\d.]*', text))
def parse_sender_file(f_inp, f_csv):
    """Collect the stats blocks of one sender file as rows of ``f_csv``.

    The file is read line by line. Each time a ``->->...`` bracket line closes
    a previously opened bracket, all the text accumulated since the last
    written row is passed to extract_sender_stat() and written as one csv row,
    followed by the sender number.

    Args:
        f_inp: open text file of one sender; ``f_inp.name`` must contain the
            sender number.
        f_csv: open, writable text file that receives the csv rows.

    Raises:
        IndexError: if the file name contains no digits.
    """
    import os
    import re

    # Take the sender id from the file name itself. Searching the whole path
    # would pick up digits of a parent folder first (experiment folders are
    # named like "N10_nstates..."). Fall back to the full path only when the
    # base name has no digits, which is what the old code always did.
    id_candidates = (re.findall(r"(\d+)", os.path.basename(f_inp.name))
                     or re.findall(r"(\d+)", f_inp.name))
    sender_name = id_candidates[0]

    bracket = "->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->\n"
    is_bracket_open = False
    text = ""

    # The actual parsing
    for line in f_inp:
        # Every line, bracket or not, is part of the accumulated text.
        text += line
        if line != bracket:
            continue
        is_bracket_open = not is_bracket_open
        if not is_bracket_open:
            # We just closed a bracket that was open: emit one row.
            csv_line = extract_sender_stat(text)
            csv_line += f",{sender_name}\n"
            f_csv.write(csv_line)
            text = ""
def parse_senders():
    """Process all sender_* files and store the result in senders.csv.

    Every entry of ``data/stats/`` whose name contains ``sender_`` is parsed
    with parse_sender_file() and the rows are written to
    ``data/stats/senders.csv`` (overwritten on each run), after a header row.

    This function doesn't change path: ``data/stats/`` is resolved relative to
    the current working directory.
    """
    import os

    stats_dir = "data/stats/"
    file_names = [
        os.path.join(stats_dir, f_name)
        for f_name in os.listdir(stats_dir)
        if "sender_" in f_name
    ]

    # `with` guarantees the csv is flushed and every handle is closed, even if
    # parsing one of the files raises.
    with open("data/stats/senders.csv", "w") as csv_file:
        # csv header
        csv_file.write("time,mpi_wait_sec,hash_sec,hash_speed,hash_speed_MB,find_dist_sec,mpi_send_percent,hash_percent,find_dist_percent,send_speed_MB,exp_all_senders_speed_MB,nsenders,nreceivers,difficulty,interval,nsends,sender_name\n")
        for f in file_names:
            print(f"Going to treat {f}")
            with open(f, "r") as sender_file:
                parse_sender_file(sender_file, csv_file)
# Script entry: both conversions run as soon as this module is executed (or
# imported). Receivers are converted first, then senders.
parse_receivers()
parse_senders()