quickie_covid_script/covid_analysis.py at master · drrelyea/quickie_covid_script · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# This script is atrocious. I made it in a few minutes one night. Don't judge me. =D

import os
import sys
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def get_cmap(n, name='hsv'):
    '''Return the colormap ``name`` resampled to ``n`` entries.

    Args:
        n: Number of entries to request from the colormap lookup table.
        name: Name of a standard matplotlib colormap.

    Returns:
        The colormap object produced by ``plt.get_cmap``; calling it with an
        index in 0, 1, ..., n-1 yields an RGBA color.
    '''
    # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in
    # 3.9; pyplot.get_cmap accepts the same (name, lut) arguments.
    return plt.get_cmap(name, n)

# Per-state population table. The path is relative, so it is resolved against
# the directory the script is launched from; this has to run before the
# os.chdir() below.
state_populations = pd.read_csv('state_populations.csv', usecols=['State', 'Pop'])


# Working directory that holds the local clone of the CSSEGISandData/COVID-19
# repository.
my_local_directory = '/Users/relyea/data/corona/'
# makedirs also creates missing parent directories and does not fail when the
# directory already exists.
os.makedirs(my_local_directory, exist_ok=True)
os.chdir(my_local_directory)
if not os.path.exists('COVID-19'):
    clone_status = os.system('git clone https://github.com/CSSEGISandData/COVID-19.git')
    if clone_status != 0:
        # Stop here with a clear message instead of failing later on an
        # empty list of report files.
        raise RuntimeError('git clone of the COVID-19 data repository failed')

def _report_date_key(path):
    '''Return a (year, month, day) sort key for a MM-DD-YYYY.csv report path.'''
    stamp = os.path.basename(path)[0:10]
    return (stamp[6:10], stamp[0:2], stamp[3:5])


# Sort chronologically. A plain string sort of MM-DD-YYYY names interleaves
# different years (01-01-2021 sorts before 01-22-2020).
thefiles = sorted(
    glob(my_local_directory + '/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'),
    key=_report_date_key,
)

# they changed format a few days ago, because reasons
# in fairness, the new format is a lot nicer
# Split on the date of the change rather than on a list position: index 60
# equals 03-22-2020 only while the listing starts at 01-22-2020 and contains
# nothing but 2020 files.
first_long_format_day = ('2020', '03', '22')
shortfiles = [path for path in thefiles if _report_date_key(path) < first_long_format_day]
longfiles = [path for path in thefiles if _report_date_key(path) >= first_long_format_day]

# load data from the short-format files
short_data = []
for filename in shortfiles:
    the_date = filename.split('/')[-1][0:10]

    # start on March: skip the January and February reports
    if the_date[:2] in ('01', '02'):
        continue
    df = pd.read_csv(filename, index_col=None, header=0)
    df['thedate'] = the_date
    short_data.append(df)

short_frame = pd.concat(short_data, axis=0, ignore_index=True)
# Keep only the US rows and the four columns used later. The explicit copy
# makes the column edits below act on an independent frame instead of a
# slice of short_frame.
short_usdata = short_frame.loc[
    short_frame['Country/Region'] == 'US',
    ['Confirmed', 'Deaths', 'Province/State', 'thedate'],
].copy()
short_usdata.columns = ['tot', 'ded', 'state', 'day']
# Treat missing counts as zero. Fill with a number (not the string '0.') and
# cast both count columns, so the later groupby sums stay numeric.
for count_column in ('tot', 'ded'):
    short_usdata[count_column] = short_usdata[count_column].fillna(0).astype(float)

# load data from the long-format files
long_data = []
for filename in longfiles:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Tag every row with the report date taken from the file name.
    df['thedate'] = filename.split('/')[-1][0:10]
    long_data.append(df)

long_frame = pd.concat(long_data, axis=0, ignore_index=True)
# Keep only the US rows and the four columns used later, renamed to match
# the short-format frame.
long_usdata = long_frame.loc[
    long_frame['Country_Region'] == 'US',
    ['Confirmed', 'Deaths', 'Province_State', 'thedate'],
].copy()
long_usdata.columns = ['tot', 'ded', 'state', 'day']
# A groupby(['state', 'day']).sum() used to run here, but its result was
# never assigned, so it had no effect and has been dropped.

# Date of the last long-format report, read from the list itself rather than
# from whatever the loop variable happened to hold.
most_recent_date = longfiles[-1].split('/')[-1][0:10]

# Every state listed in the population table is analysed, in table order.
the_states = state_populations['State'].values

# When True, a plot is drawn after each state and the script waits for Enter.
draw_curves = True

# Column the growth fit runs on: 'tot' (confirmed) or 'ded' (deaths).
# thetype = 'tot'
# the_pct = 0.03

thetype = 'ded'
# NOTE(review): the_pct is not read by the loop below; the 3% threshold there
# is hard-coded.
the_pct = 0.006

# do the last 5 days
n_most_recent_days = 5

data_to_plot_list = []
for the_state in the_states:
    # One row per day for this state, short-format days followed by long-format days.
    short_group = short_usdata[short_usdata.state == the_state][['tot', 'ded', 'day']].groupby('day').sum()
    long_group = long_usdata[long_usdata.state == the_state][['tot', 'ded', 'day']].groupby('day').sum()
    usdata = pd.concat([short_group, long_group])

    if len(usdata) < n_most_recent_days:
        # The fit needs n_most_recent_days points; skip rather than crash.
        print(the_state.ljust(15) + ' not enough data')
        continue

    # log2(0) is -inf, so zero counts are bumped to 1 before the fit.
    usdata.loc[usdata[thetype] == 0, thetype] = 1

    # Straight-line fit of log2(count) against day number; the slope is the
    # number of doublings per day.
    reg = LinearRegression().fit(
        np.arange(n_most_recent_days).reshape(-1, 1),
        np.log2(usdata[thetype].values[-n_most_recent_days:]),
    )
    overall_population = state_populations[state_populations.State == the_state].Pop.values[0]

    # .iloc gives explicit positional access; plain [-1] on a string-labelled
    # index is deprecated in recent pandas.
    latest_dead = usdata['ded'].iloc[-1]
    # Deaths * 100 / population. Presumably this assumes roughly 1% of
    # infections are fatal, making it an infected-fraction estimate -- confirm.
    per_capita_total_infected = latest_dead * 100 / overall_population
    doubling_period_in_days = 1.0 / reg.coef_[0]
    doubling_period_until_3pct = np.log2(0.03 / per_capita_total_infected) * doubling_period_in_days

    # '%d' raises on inf/nan (e.g. a flat curve gives a zero slope), so
    # non-finite values are printed as text instead.
    if np.isfinite(doubling_period_until_3pct):
        time_to_3pct_text = ' %02d' % doubling_period_until_3pct
    else:
        time_to_3pct_text = ' %s' % doubling_period_until_3pct

    print(
        the_state.ljust(15) +
        ' dead: ' +
        ('%d' % latest_dead).rjust(5) +
        ' percap: ' +
        '%0.3f' % (per_capita_total_infected * 100) + '%' +
        ' period/time to 3%: ' + '%1.1f' % doubling_period_in_days +
        time_to_3pct_text
    )
    if draw_curves:
        # Daily new deaths; earlier states are redrawn faintly for context and
        # the current state is drawn on top at full opacity.
        data_to_plot_list.append(usdata['ded'].diff())
        plt.clf()
        for item in data_to_plot_list:
            plt.semilogy(item, alpha=0.1)
        plt.semilogy(usdata['ded'].diff(), alpha=1.0)
        plt.draw()
        # Non-blocking show plus a short pause lets the window render, then
        # the script waits for Enter before moving to the next state.
        plt.show(block=False)
        plt.pause(0.001)
        input()