-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
72 lines (64 loc) · 3.33 KB
/
data.py
File metadata and controls
72 lines (64 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
def read_json_data(folder_path, encoding='utf-8'):
    """Load and concatenate the records of every ``.json`` file in a folder.

    Files are read in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the order of the combined records
    (and anything derived from "first seen" values) could differ between
    machines.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; sub-directories are not walked.
        encoding: Text encoding used to open each file.

    Returns:
        A single list holding the contents of all matching files. The
        elements of a top-level JSON array are added individually; a
        top-level JSON object is added as one record.

    Raises:
        OSError: If the folder or one of the files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    data = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding=encoding) as file:
            json_data = json.load(file)
        if isinstance(json_data, dict):
            # list.extend() on a dict would add only its keys; keep the
            # object intact as a single record instead.
            data.append(json_data)
        else:
            data.extend(json_data)
    return data
def create_dataframe(data):
    """Build a pandas DataFrame from ``data`` and return it.

    ``data`` is handed to ``pd.DataFrame`` unchanged.
    """
    frame = pd.DataFrame(data)
    return frame
# Load every JSON file under 'spotify_raw_data' into one DataFrame, then
# discard the rows that have no track name.
df = create_dataframe(read_json_data('spotify_raw_data')).dropna(
    subset=['master_metadata_track_name'],
)
# Per-track statistics, one row per 'spotify_track_uri': how many rows the
# track has, its summed 'ms_played', and the first track / artist / album
# name found in the group.
grouped_bytrack_df = (
    df.groupby('spotify_track_uri')
    .agg(
        count=pd.NamedAgg(column='spotify_track_uri', aggfunc='count'),
        total_ms_played=pd.NamedAgg(column='ms_played', aggfunc='sum'),
        master_track_name=pd.NamedAgg(
            column='master_metadata_track_name', aggfunc='first'
        ),
        master_artist_name=pd.NamedAgg(
            column='master_metadata_album_artist_name', aggfunc='first'
        ),
        master_album_name=pd.NamedAgg(
            column='master_metadata_album_album_name', aggfunc='first'
        ),
    )
    .reset_index()
)

# The 100 tracks with the highest row count, and the 100 with the highest
# summed 'ms_played'.
top_count_bytrack_df = grouped_bytrack_df.nlargest(n=100, columns='count')
top_ms_played_bytrack_df = grouped_bytrack_df.nlargest(
    n=100, columns='total_ms_played'
)

# Export both rankings without the DataFrame index.
top_count_bytrack_df.to_csv(
    './tests/top_count_bytrack_output.csv', index=False
)
top_ms_played_bytrack_df.to_csv(
    './tests/top_ms_played_bytrack_output.csv', index=False
)
# Per-artist statistics, one row per 'master_metadata_album_artist_name':
# summed 'ms_played', the artist name again under a shorter column name,
# the number of rows, and the number of distinct track URIs.
grouped_byartist_df = (
    df.groupby('master_metadata_album_artist_name')
    .agg(
        total_ms_played=pd.NamedAgg(column='ms_played', aggfunc='sum'),
        master_artist_name=pd.NamedAgg(
            column='master_metadata_album_artist_name', aggfunc='first'
        ),
        count=pd.NamedAgg(column='spotify_track_uri', aggfunc='count'),
        total_distinct_tracks=pd.NamedAgg(
            column='spotify_track_uri', aggfunc='nunique'
        ),
    )
    .reset_index()
)

# The 100 artists with the highest row count, and the 100 with the highest
# summed 'ms_played'.
top_count_byartist_df = grouped_byartist_df.nlargest(n=100, columns='count')
top_ms_played_byartist_df = grouped_byartist_df.nlargest(
    n=100, columns='total_ms_played'
)

# Export both rankings without the DataFrame index.
top_count_byartist_df.to_csv(
    './tests/top_count_byartist_output.csv', index=False
)
top_ms_played_byartist_df.to_csv(
    './tests/top_ms_played_byartist_output.csv', index=False
)
# Check on a specific artist (disabled; uncomment the lines below to export Radiohead's tracks)
#radiohead_df = df[df['master_metadata_album_artist_name'] == 'Radiohead']
#sorted_radiohead_df = radiohead_df.groupby('spotify_track_uri').agg(
# master_track_name=('master_metadata_track_name', 'first')
#).reset_index()
#sorted_radiohead_df.to_csv('./tests/radiohead_output.csv', index=False)
# Monthly listening summary.
# Convert the 'ts' column to datetimes, then add a year-month period column,
# e.g. '2015-12-11T18:04:19Z' becomes '2015-12'.
df['ts'] = pd.to_datetime(df['ts'])
df['year_month'] = df['ts'].dt.to_period('M')


def _top_by_ms_played(frame, name_column):
    """Return, for each 'year_month', the ``name_column`` value with the most 'ms_played'.

    Args:
        frame: DataFrame with 'year_month', 'ms_played' and ``name_column``
            columns.
        name_column: Column whose values are ranked within each month.

    Returns:
        A Series indexed by 'year_month' holding the winning value. Months
        in which ``name_column`` is always null are absent, because groupby
        drops null keys.
    """
    totals = frame.groupby(
        ['year_month', name_column], as_index=False
    )['ms_played'].sum()
    # After a descending sort, the first row kept for each month is the one
    # with the largest total. A stable sort keeps the outcome deterministic
    # when totals tie.
    winners = totals.sort_values(
        'ms_played', ascending=False, kind='stable'
    ).drop_duplicates('year_month')
    return winners.set_index('year_month')[name_column]


monthly_df = df.groupby('year_month').agg(
    total_ms_played=('ms_played', 'sum'),
    total_count=('spotify_track_uri', 'count'),
)
# The "top ... by ms_played" columns rank by summed playback time, as their
# names state, not by how often a name occurs. Assignment aligns on the
# 'year_month' index, so a month without any named rows gets NaN instead of
# raising.
monthly_df['top_track_by_ms_played'] = _top_by_ms_played(
    df, 'master_metadata_track_name'
)
monthly_df['top_artist_by_ms_played'] = _top_by_ms_played(
    df, 'master_metadata_album_artist_name'
)
monthly_df = monthly_df.reset_index()
monthly_df.to_csv('./tests/monthly_output.csv', index=False)
# Line chart of the total listening time per month.
# 3,600,000 ms = 1 hour.
monthly_df['total_hours_played'] = monthly_df['total_ms_played'].div(3_600_000)
x_labels = monthly_df['year_month'].astype(str)

plt.figure(figsize=(10, 6))
# The format string 'o-' selects circle markers joined by a solid line.
plt.plot(x_labels, monthly_df['total_hours_played'], 'o-')

plt.xlabel('Month')
plt.ylabel('Total hours played')
plt.title('Total hours played per month')

# Rotate the month labels so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()