-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdaily_sidestream_counts.py
More file actions
executable file
·130 lines (111 loc) · 4.33 KB
/
daily_sidestream_counts.py
File metadata and controls
executable file
·130 lines (111 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
#%%
import os
import math
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections
# Some matplotlib features are version dependent.
assert(matplotlib.__version__ >= '2.1.2')
# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery
def run_query(query, project='mlab-sandbox'):
client = bigquery.Client(project=project)
job = client.query(query)
results = collections.defaultdict(list)
for row in job.result(timeout=300):
for key in row.keys():
results[key].append(row.get(key))
return pd.DataFrame(results)
#%%
df_ss_count = run_query(
"""#standardSQL
CREATE TEMPORARY FUNCTION sliceFromIP(test_id STRING, ipaddr STRING)
AS (
CASE
WHEN REGEXP_CONTAINS(test_id, r"mlab1.(atl01|ord01|sea01|lga02)") AND MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 = 10
THEN 7
WHEN REGEXP_CONTAINS(test_id, r"mlab1.(dfw01|lax01)") AND MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10 = 11
THEN 7
ELSE
MOD(CAST(REGEXP_EXTRACT(ipaddr, r'[:.]([0-9]+)$') AS INT64), 64) - 10
END
);
SELECT
hostname, ts, count(*) as count
FROM (
SELECT
REGEXP_EXTRACT(test_id, r"\d\d\d\d/\d\d/\d\d/(mlab[1-4].[a-z]{3}[0-9]{2})") AS hostname,
UNIX_SECONDS(TIMESTAMP_TRUNC(log_time, DAY)) AS ts
FROM
`mlab-sandbox.gfr.sidestream_*`
WHERE
REGEXP_CONTAINS(test_id, r"mlab1.(dfw|lga|iad|lax|atl|den|sea|nuq|ord|mia)[0-9]{2}.*")
-- AND sliceFromIP(test_id, web100_log_entry.connection_spec.local_ip) = 7
AND web100_log_entry.snap.HCThruOctetsAcked >= 1000000
AND (web100_log_entry.snap.SndLimTimeRwin +
web100_log_entry.snap.SndLimTimeCwnd +
web100_log_entry.snap.SndLimTimeSnd) >= 9000000
AND (web100_log_entry.snap.SndLimTimeRwin +
web100_log_entry.snap.SndLimTimeCwnd +
web100_log_entry.snap.SndLimTimeSnd) < 600000000
AND (web100_log_entry.snap.State = 1 OR
(web100_log_entry.snap.State >= 5 AND
web100_log_entry.snap.State <= 11))
GROUP BY
hostname, ts,
web100_log_entry.connection_spec.remote_ip,
web100_log_entry.connection_spec.remote_port,
web100_log_entry.connection_spec.local_port,
web100_log_entry.connection_spec.local_ip
)
GROUP BY
hostname, ts
ORDER BY
hostname, ts
""")
print 'Done', len(df_ss_count)
#%%
sites = [
['dfw', 'lga', 'iad'],
['lax', 'atl', 'den'],
['sea', 'nuq', 'ord'], # MIA is low utilization.
]
# MIA, DEN, and SEA are relatively low utilization.
# NUQ, ORD show trends less dramatic than those below.
# LGA usage appeared to dramatically lower around 2018-01.
cols = len(sites[0])
fig = plt.figure(figsize=(4 * cols, 4 * cols))
axes = [
[None] * cols,
[None] * cols,
[None] * cols,
]
for r, siter in enumerate(sites):
for c, site in enumerate(siter):
axes[r][c] = plt.subplot2grid((cols, cols), (r, c))
if c == 0:
axes[r][c].set_ylabel('Connection Counts')
#else:
# axes[r][c].set_yticklabels([])
if r != 2:
axes[r][c].set_xticklabels([])
prefix = 'mlab1.' + site
ds_sites = df_ss_count[ df_ss_count['hostname'].str.contains(prefix) ]
for host in sorted(set(ds_sites['hostname'])):
ds = ds_sites[ (ds_sites['hostname'].str.contains(host)) ]
axes[r][c].plot_date(
dates.epoch2num(ds['ts']),
ds['count'],
ls='-', ms=0, label=host[6:11])
axes[r][c].set_title(site)
#axes[r][c].set_ylim(0, 25000)
axes[r][c].tick_params(axis='x', labelrotation=90)
axes[r][c].grid(color='#dddddd')
axes[r][c].legend(loc=2, fontsize='x-small', ncol=2)
fig.suptitle('Daily TCP Connection Counts Per Metro - SamKnows Only')
plt.show()