# Project 03.py — BrunoPerciani/data-analysis-in-python-studies (GitHub, main branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# ============================================
# 1. Task Description
# Analyze Nobel Prize winner data to answer:
# 1) Most commonly awarded gender and birth country.
# 2) Decade with highest ratio of US-born Nobel winners to total winners.
# 3) Decade–category with highest proportion of female laureates.
# 4) First woman to receive a Nobel Prize and in which category.
# 5) Individuals/organizations who have won more than one Nobel Prize.
#
# 2. Topics Covered
# - Reading CSV data with pandas
# - Value counts and Boolean flags
# - Groupby with proportions (mean of Boolean)
# - Decade bucketing
# - Basic faceted line plots (optional visualization)
# ============================================

# 3. Python Script

# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np

# Load the Nobel Prize records from the CSV file in the data folder
nobel = pd.read_csv('data/nobel.csv')

# Build a frequency table for each column of interest. value_counts() orders
# its result from most to least frequent, so the first index label is the
# most common value.
gender_counts = nobel['sex'].value_counts()
country_counts = nobel['birth_country'].value_counts()
top_gender = gender_counts.index[0]
top_country = country_counts.index[0]

# Report both answers
print("\n The gender with the most Nobel laureates is :", top_gender)
print(" The most common birth country of Nobel laureates is :", top_country)

# Flag the rows whose birth country is the USA and bucket every award year
# into its decade (e.g. 1987 -> 1980)
nobel['usa_born_winner'] = nobel['birth_country'] == 'United States of America'
nobel['decade'] = (np.floor(nobel['year'] / 10) * 10).astype(int)

# The mean of a Boolean column is the fraction of True values, i.e. the
# share of US-born winners among all winners of each decade
prop_usa_winners = nobel.groupby('decade', as_index=False)['usa_born_winner'].mean()

# Identify the decade with the highest proportion of US-born winners.
# idxmax() gives the label of the first row holding the maximum, so ties
# resolve to the earliest decade. int() turns the numpy scalar into a plain
# Python int, which is the type this variable is documented to have.
top_usa_row = prop_usa_winners['usa_born_winner'].idxmax()
max_decade_usa = int(prop_usa_winners.loc[top_usa_row, 'decade'])

# Optional: line plot of the US-born share per decade
ax1 = sns.relplot(x='decade', y='usa_born_winner', data=prop_usa_winners, kind="line")

# Calculate the proportion of female laureates for every (decade, category)
# pair; the mean of the Boolean flag is the share of True values per group
nobel['female_winner'] = nobel['sex'] == 'Female'
prop_female_winners = nobel.groupby(['decade', 'category'], as_index=False)['female_winner'].mean()

# Keep every (decade, category) row that reaches the highest female share
highest_female_share = prop_female_winners['female_winner'].max()
max_female_decade_category = prop_female_winners.loc[
    prop_female_winners['female_winner'] == highest_female_share,
    ['decade', 'category'],
]

# Create a dictionary with a single decade -> category pair, taken from the
# first matching row. The decade is converted from a numpy scalar to a plain
# Python int so the key is a built-in type (e.g. usable with json.dumps).
top_female_decade = int(max_female_decade_category['decade'].iloc[0])
top_female_category = max_female_decade_category['category'].iloc[0]
max_female_dict = {top_female_decade: top_female_category}

# Optional: line plot of the female share per decade, one line per category
ax2 = sns.relplot(x='decade', y='female_winner', hue='category', data=prop_female_winners, kind="line")

# Restrict the data to female laureates using the Boolean flag column
nobel_women = nobel.loc[nobel['female_winner']]

# Keep the row(s) from the earliest award year among those laureates
earliest_female_year = nobel_women['year'].min()
min_row = nobel_women.loc[nobel_women['year'] == earliest_female_year]

# Take the name and category from the first of those rows
first_woman_name = min_row['full_name'].iloc[0]
first_woman_category = min_row['category'].iloc[0]
print(f"\n The first woman to win a Nobel Prize was {first_woman_name}, in the category of {first_woman_category}.")

# Count how many rows each laureate name appears in
counts = nobel['full_name'].value_counts()

# Keep only the names that appear at least twice and collect them in a list
has_multiple_prizes = counts.ge(2)
repeats = counts.loc[has_multiple_prizes].index
repeat_list = repeats.tolist()

print("\n The repeat winners are :", repeat_list)

# Variables requested by the prompt are:
# top_gender (str), top_country (str), max_decade_usa (int),
# max_female_dict (dict with one key-value pair),
# first_woman_name (str), first_woman_category (str), repeat_list (list of str)

# ============================================
# 4. Additional Notes
# - Proportions are computed as the mean of Boolean flags within each group.
# - Decades are computed by flooring the year to the nearest decade:
#     decade = floor(year/10)*10
# - If multiple rows tie for the maximum in any step, only the first
#   matching row is used; this matches the requirement that max_female_dict
#   contains a single (decade → category) pair.
# - Optional relplot() calls produce quick trend visuals, but they are not
#   required to compute the answers.
# ============================================