avertablemortality/utils.py at main · leonchlon/avertablemortality · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import zipfile

# Base directory for the project's inputs; the CSV files read by
# load_regional_life_expectancy live in its ``data`` subdirectory.
DOWNLOAD_FOLDER = './'
DATA_FOLDER = os.path.join(DOWNLOAD_FOLDER, 'data')

# Measure labels. NOTE(review): presumably these match the values of the
# 'measure' column in the input CSVs -- confirm against the data files.
measures = [
    'DALYs (Disability-Adjusted Life Years)',
    'YLDs (Years Lived with Disability)',
    'Deaths',
    'YLLs (Years of Life Lost)',
]

# Country-name translation applied to the index built in
# load_regional_life_expectancy: keys are the spellings found after the merge,
# values are the spellings they are replaced with.  Names without an entry are
# kept unchanged.
remap = {
    "Bolivia, Plurinational State of": 'Bolivia (Plurinational State of)',
    "Congo, Democratic Republic of the": "Democratic Republic of the Congo",
    'Iran, Islamic Republic of': 'Iran (Islamic Republic of)',
    "Korea, Democratic People's Republic of": "Democratic People's Republic of Korea",
    'Korea, Republic of': 'Republic of Korea',
    'Micronesia, Federated States of': 'Micronesia (Federated States of)',
    'Moldova, Republic of': 'Republic of Moldova',
    'Netherlands, Kingdom of the': 'Netherlands',
    'Tanzania, United Republic of': 'United Republic of Tanzania',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Venezuela, Bolivarian Republic of': 'Venezuela (Bolivarian Republic of)',
}

def load_regional_life_expectancy(year):
    """Build a per-country table of life expectancy with benchmark flags.

    Reads ``LifeExpectancy.csv`` (indexed by its ``Country Name`` column) and
    ``all.csv`` (indexed by its first column) from ``DATA_FOLDER`` and joins
    them on that index, so only countries present in both files are kept.

    Args:
        year: Year to load; ``str(year)`` is used as the column label in
            ``LifeExpectancy.csv``.

    Returns:
        A DataFrame indexed by country name (after applying ``remap``) with
        the columns ``region``, ``str(year)``, ``regional_75`` (75th
        percentile of the year's values within the country's region),
        ``global_benchmark`` (value > 80) and ``regional_benchmark``
        (value > ``regional_75``).

    Raises:
        KeyError: If ``LifeExpectancy.csv`` has no column for ``year``.
    """
    year_col = str(year)
    LE_path = os.path.join(DATA_FOLDER, 'LifeExpectancy.csv')
    regional_path = os.path.join(DATA_FOLDER, 'all.csv')

    le_benchmarks = pd.read_csv(LE_path, index_col='Country Name')[year_col]
    regions = pd.read_csv(regional_path, index_col=0)

    regional_le = regions[['region']].merge(le_benchmarks, left_index=True, right_index=True)

    # Series.quantile ignores missing values, whereas np.percentile returns
    # NaN as soon as one country in the region has no value for the year,
    # which would turn regional_benchmark to False for the entire region.
    # Both use linear interpolation, so complete regions give the same result.
    regional_le['regional_75'] = (
        regional_le.groupby('region')[year_col]
        .transform(lambda values: values.quantile(0.75))
    )
    regional_le['global_benchmark'] = regional_le[year_col] > 80
    regional_le['regional_benchmark'] = regional_le[year_col] > regional_le['regional_75']

    # Translate country names that have an entry in remap; keep the others.
    regional_le.index = [remap.get(name, name) for name in regional_le.index]
    return regional_le

def process_file(file_path, year):
    """Load one CSV file and keep only the rows recorded for ``year``.

    Args:
        file_path: Path of the CSV file to read.
        year: Value the file's ``year`` column must equal for a row to be kept.

    Returns:
        The matching rows, limited to the columns ``measure``, ``location``,
        ``sex``, ``age``, ``cause``, ``metric`` and ``val`` (in that order).
    """
    keep_columns = ['measure', 'location', 'sex', 'age', 'cause', 'metric', 'val']
    frame = pd.read_csv(file_path)
    # "@year" refers to this function's ``year`` argument, not to the column.
    rows_for_year = frame.query("year == @year")
    return rows_for_year[keep_columns]

# Create a separate figure for each combination of top N and measure.
def create_figure_for_top_n_and_measure(data, countries, top_n, measure, PLOT_FOLDER,
                                        output_file_prefix, years=(2018, 2021)):
    """Draw a two-panel, two-year comparison of cause shares and save it as PNG.

    Each panel is drawn by ``plot_avertable_by_condition`` for one of the two
    years, using the same countries and the same cause-to-color mapping.  The
    panels share their y axis and a single figure-level legend.

    Args:
        data: DataFrame with at least a ``Cause`` column (used here to assign
            colors); it is passed on unchanged to
            ``plot_avertable_by_condition``.
        countries: Countries to plot; forwarded to both panels.
        top_n: Number of top causes per panel; forwarded to both panels.
        measure: Name of the value column to plot; also part of the file name.
        PLOT_FOLDER: Directory the PNG is written into.
        output_file_prefix: Leading part of the output file name.
        years: Pair ``(left_year, right_year)`` of years to compare.  Defaults
            to ``(2018, 2021)``, which reproduces the original file name
            ``<prefix>_top_<top_n>_<measure>_2018_2021.png``.

    Raises:
        ValueError: If ``years`` does not contain exactly two entries.
    """
    first_year, second_year = years
    fig, axs = plt.subplots(1, 2, figsize=(20, 12), sharey=True)

    # Define colors for causes, ensuring each cause keeps one color in both
    # panels.  COVID-19 is pinned to red; the rest cycle through the palette.
    cause_colors = {
        'COVID-19': '#ff6666',
    }
    default_colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values())
    for i, cause in enumerate(data['Cause'].unique()):
        if cause not in cause_colors:
            cause_colors[cause] = default_colors[i % len(default_colors)]

    # One panel per year, both using the same countries.
    for ax, year in zip(axs, (first_year, second_year)):
        plot_avertable_by_condition(data, year, countries, top_n, ax, cause_colors, measure)

    # Each panel selects its own top causes, so the two panels can show
    # different causes.  Collect legend entries from both panels (a cause has
    # the same color in each) instead of only from the left one, then drop the
    # per-panel legends in favor of the shared one.
    legend_entries = {}
    for ax in axs:
        handles, labels = ax.get_legend_handles_labels()
        for handle, label in zip(handles, labels):
            legend_entries.setdefault(label, handle)
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    ordered_labels = sorted(legend_entries)
    fig.legend([legend_entries[label] for label in ordered_labels], ordered_labels,
               loc='lower center', ncol=5, title="Causes")

    # Adjust layout and save; operate on this figure explicitly rather than on
    # pyplot's "current figure".
    fig.tight_layout()
    fig.savefig(os.path.join(
        PLOT_FOLDER,
        f'{output_file_prefix}_top_{top_n}_{measure}_{first_year}_{second_year}.png'))
    plt.close(fig)


# Define function to calculate and plot percentage distributions for a given measure
def plot_avertable_by_condition(data, year, countries, top_n, ax, colors, measure):
    """Plot each country's percentage split across the top causes for one year.

    The ``top_n`` causes are those with the largest summed ``measure`` over
    the selected countries in ``year``.  For every country, the values of
    these causes are normalized to percentages of their own total and drawn
    as a stacked bar on ``ax``.  A ``COVID-19`` series is always present
    (all zeros when COVID-19 is not among the top causes) so that panels for
    different years carry the same legend entry.

    Args:
        data: DataFrame with ``Year``, ``Location`` and ``Cause`` columns and
            a numeric column named ``measure``.
        year: Value of ``Year`` to select.
        countries: Collection of ``Location`` values to include.
        top_n: Number of causes to keep.
        ax: Matplotlib axes to draw on.
        colors: Mapping from cause name to color; causes missing from it are
            drawn in ``'#cccccc'``.
        measure: Name of the column to aggregate.

    Returns:
        None.  The chart, title and axis labels are set on ``ax``.
    """
    # Filter data for the specified year and countries; rows without a value
    # for the measure cannot contribute to any total.
    selected = data[(data['Year'] == year) & (data['Location'].isin(countries))]
    selected = selected.dropna(subset=[measure])

    # Rank causes by their total over all selected countries.
    top_causes = (
        selected.groupby('Cause')[measure]
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
        .index
    )
    selected = selected[selected['Cause'].isin(top_causes)]

    # Add a zero-valued COVID-19 row if the cause is absent, for consistency
    # between years.  Attach it to a country that already has data so that it
    # does not introduce an extra (blank) bar; fall back to the first
    # requested country only when nothing at all matched.
    if 'COVID-19' not in selected['Cause'].unique():
        if selected.empty:
            placeholder_location = countries[0]
        else:
            placeholder_location = selected['Location'].iloc[0]
        selected = pd.concat([
            selected,
            pd.DataFrame({'Year': [year], 'Location': [placeholder_location],
                          'Cause': ['COVID-19'], measure: [0]})
        ])

    # Countries as rows, causes as columns, missing combinations as 0.
    cause_distribution = (
        selected.groupby(['Location', 'Cause'])[measure]
        .sum()
        .unstack(fill_value=0)
    )

    # Normalize each country's row to percentages of its own total.
    cause_percentages = cause_distribution.div(cause_distribution.sum(axis=1), axis=0) * 100

    # Sort causes alphabetically so the stacking order is stable across calls.
    cause_percentages = cause_percentages.sort_index(axis=1)
    cause_percentages.plot(
        kind='bar',
        stacked=True,
        ax=ax,
        color=[colors.get(cause, '#cccccc') for cause in cause_percentages.columns],
    )
    ax.set_title(f'Top {top_n} Causes of {measure} (Top {len(countries)} Countries, {year})')
    ax.set_ylabel(f'Percentage of {measure}')
    ax.set_xlabel('Country')
    ax.tick_params(axis='x', rotation=90)