-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
130 lines (107 loc) · 5.65 KB
/
utils.py
File metadata and controls
130 lines (107 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import zipfile
# Root folder for downloads; the data files live in its ``data`` sub-folder.
DOWNLOAD_FOLDER = './'
DATA_FOLDER = os.path.join(DOWNLOAD_FOLDER, 'data')

# Measure labels used by this module.
measures = [
    'DALYs (Disability-Adjusted Life Years)',
    'YLDs (Years Lived with Disability)',
    'Deaths',
    'YLLs (Years of Life Lost)',
]

# Country-name translation table: "Name, Qualifier" style spellings (keys)
# mapped to the alternative spellings (values) applied to the index in
# load_regional_life_expectancy.
remap = {
    "Bolivia, Plurinational State of": "Bolivia (Plurinational State of)",
    "Congo, Democratic Republic of the": "Democratic Republic of the Congo",
    "Iran, Islamic Republic of": "Iran (Islamic Republic of)",
    "Korea, Democratic People's Republic of": "Democratic People's Republic of Korea",
    "Korea, Republic of": "Republic of Korea",
    "Micronesia, Federated States of": "Micronesia (Federated States of)",
    "Moldova, Republic of": "Republic of Moldova",
    "Netherlands, Kingdom of the": "Netherlands",
    "Tanzania, United Republic of": "United Republic of Tanzania",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "Venezuela, Bolivarian Republic of": "Venezuela (Bolivarian Republic of)",
}
def load_regional_life_expectancy(year):
    """Load life expectancy for one year and flag countries above benchmarks.

    Reads ``LifeExpectancy.csv`` (indexed by its ``Country Name`` column) and
    ``all.csv`` (indexed by its first column) from ``DATA_FOLDER`` and
    inner-joins them on their indexes.

    Args:
        year: Year to load; ``str(year)`` is used as the column label in
            ``LifeExpectancy.csv``.

    Returns:
        A DataFrame with the columns ``region``, ``str(year)``,
        ``regional_75`` (75th percentile of the year column within each
        region, ignoring missing values), ``global_benchmark`` (value > 80)
        and ``regional_benchmark`` (value > ``regional_75``). Index labels
        found in ``remap`` are replaced by their mapped spelling.
    """
    year_col = str(year)
    le_path = os.path.join(DATA_FOLDER, 'LifeExpectancy.csv')
    regional_path = os.path.join(DATA_FOLDER, 'all.csv')

    le_benchmarks = pd.read_csv(le_path, index_col='Country Name')[year_col]
    regions = pd.read_csv(regional_path, index_col=0)
    regional_le = regions[['region']].merge(
        le_benchmarks, left_index=True, right_index=True
    )

    # Series.quantile skips NaN, so one country with a missing value no longer
    # turns the whole region's percentile (and every regional_benchmark flag
    # in that region) into NaN/False, as np.percentile did.
    regional_le['regional_75'] = (
        regional_le.groupby('region')[year_col]
        .transform(lambda values: values.quantile(0.75))
    )
    regional_le['global_benchmark'] = regional_le[year_col] > 80
    regional_le['regional_benchmark'] = (
        regional_le[year_col] > regional_le['regional_75']
    )

    # Harmonise country spellings; labels without a mapping are kept as-is.
    regional_le.index = [remap.get(name, name) for name in regional_le.index]
    return regional_le
def process_file(file_path, year):
    """Read one CSV file and return its rows for a single year.

    Args:
        file_path: Path of the CSV file to read.
        year: Value the ``year`` column must equal for a row to be kept.

    Returns:
        A DataFrame restricted to the matching rows and to the columns
        ``measure``, ``location``, ``sex``, ``age``, ``cause``, ``metric``
        and ``val``, in that order.
    """
    columns_to_keep = ['measure', 'location', 'sex', 'age', 'cause', 'metric', 'val']
    frame = pd.read_csv(file_path)
    # ``@year`` refers to this function's ``year`` argument; the bare ``year``
    # is the CSV column.
    rows_for_year = frame.query("year == @year")
    return rows_for_year[columns_to_keep]
# Function to create a separate figure for each top N and measure
def create_figure_for_top_n_and_measure(data, countries, top_n, measure, PLOT_FOLDER, output_file_prefix):
    """Draw the 2018 and 2021 cause breakdowns side by side and save as PNG.

    Args:
        data: DataFrame with a ``Cause`` column; it is passed unchanged to
            ``plot_avertable_by_condition`` for each year.
        countries: Countries to plot; the same selection is used for both
            years.
        top_n: Number of top causes to show per panel.
        measure: Name of the measure column to plot.
        PLOT_FOLDER: Directory the PNG is written to.
        output_file_prefix: Prefix of the output file name.

    The figure is saved as
    ``{output_file_prefix}_top_{top_n}_{measure}_2018_2021.png`` and then
    closed; nothing is returned.
    """
    fig, axs = plt.subplots(1, 2, figsize=(20, 12), sharey=True)

    # Define colors for causes, ensuring COVID-19 always gets the same red
    cause_colors = {
        'COVID-19': '#ff6666',  # Red for COVID-19
    }
    # Generate additional colors for other causes (cycled if there are more
    # causes than palette entries)
    default_colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values())
    for i, cause in enumerate(data['Cause'].unique()):
        if cause not in cause_colors:
            cause_colors[cause] = default_colors[i % len(default_colors)]

    # Left panel: 2018; right panel: 2021, using the same countries
    for ax, year in zip(axs, (2018, 2021)):
        plot_avertable_by_condition(data, year, countries, top_n, ax, cause_colors, measure)

    # Collect legend entries from BOTH panels: the top-N causes can differ
    # between the years, and a legend taken from the first panel alone would
    # leave causes that only appear in the second panel unlabeled.
    legend_entries = {}
    for ax in axs:
        handles, labels = ax.get_legend_handles_labels()
        for handle, label in zip(handles, labels):
            legend_entries.setdefault(label, handle)
        # Remove the per-subplot legend (if one was drawn)
        subplot_legend = ax.get_legend()
        if subplot_legend is not None:
            subplot_legend.remove()

    # Create a single legend below the subplots, in the same alphabetical
    # order the causes are stacked in
    legend_labels = sorted(legend_entries)
    fig.legend(
        [legend_entries[label] for label in legend_labels],
        legend_labels,
        loc='lower center',
        ncol=5,
        title="Causes",
    )

    # Adjust layout and save; act on this figure explicitly instead of
    # relying on pyplot's notion of the "current" figure
    fig.tight_layout()
    fig.savefig(os.path.join(PLOT_FOLDER, f'{output_file_prefix}_top_{top_n}_{measure}_2018_2021.png'))
    plt.close(fig)
# Define function to calculate and plot percentage distributions for a given measure
def plot_avertable_by_condition(data, year, countries, top_n, ax, colors, measure):
    """Plot each country's percentage split of ``measure`` across top causes.

    Args:
        data: DataFrame with ``Year``, ``Location``, ``Cause`` and
            ``measure`` columns.
        year: Value of ``Year`` to plot.
        countries: Non-empty sequence of ``Location`` values to include; its
            first element is used for the COVID-19 placeholder row.
        top_n: Number of causes (ranked by summed ``measure`` over the
            selected rows) to keep.
        ax: Matplotlib axes to draw the stacked bar chart on.
        colors: Mapping from cause name to color; causes missing from it are
            drawn in ``'#cccccc'``.
        measure: Name of the column holding the values to aggregate.

    Draws on ``ax`` and returns nothing.
    """
    # Filter data for the specified year and countries
    filtered_data = data[
        (data['Year'] == year) & (data['Location'].isin(countries))
    ]
    # Drop rows without a value for the measure; ``subset`` is passed as a
    # list because a bare string is only accepted by newer pandas versions.
    filtered_data = filtered_data.dropna(subset=[measure], axis=0)

    # Group by cause to calculate total for the given measure
    top_causes = (
        filtered_data.groupby('Cause')[measure]
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
        .index
    )

    # Filter for the top N causes
    filtered_data = filtered_data[filtered_data['Cause'].isin(top_causes)]

    # Add a zero-valued COVID-19 row if it is absent, so both years show the
    # same COVID-19 category
    if 'COVID-19' not in filtered_data['Cause'].unique():
        placeholder = pd.DataFrame(
            {'Year': [year], 'Location': [countries[0]], 'Cause': ['COVID-19'], measure: [0]}
        )
        filtered_data = pd.concat([filtered_data, placeholder])

    # Group by country and cause to get total for the measure
    cause_distribution = (
        filtered_data.groupby(['Location', 'Cause'])[measure]
        .sum()
        .unstack(fill_value=0)
    )

    # Normalize each country's row so it sums to 100
    cause_percentages = cause_distribution.div(cause_distribution.sum(axis=1), axis=0) * 100

    # Ensure COVID-19 is included in the chart
    if 'COVID-19' not in cause_percentages.columns:
        cause_percentages['COVID-19'] = 0

    # Sort causes alphabetically so the stacking order is stable
    cause_percentages = cause_percentages[cause_percentages.columns.sort_values()]

    # Plot the stacked bar chart
    bar_colors = [colors.get(cause, '#cccccc') for cause in cause_percentages.columns]
    cause_percentages.plot(kind='bar', stacked=True, ax=ax, color=bar_colors)

    # NOTE(review): the title always says "Top 20 Countries" regardless of
    # how many entries ``countries`` has - confirm with callers.
    ax.set_title(f'Top {top_n} Causes of {measure} (Top 20 Countries, {year})')
    # The plotted values are percentages (rows normalized to 100 above); the
    # previous label read "Rateage", a corrupted spelling of "Percentage".
    ax.set_ylabel(f'Percentage of {measure}')
    ax.set_xlabel('Country')
    ax.tick_params(axis='x', rotation=90)