-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgroup_parser.py
More file actions
117 lines (95 loc) · 3.93 KB
/
group_parser.py
File metadata and controls
117 lines (95 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import re
import os
# ---------------- CONFIG ----------------
# Input: one row per raw trade description. Output: one row per traded player.
RAW_CSV = "data/all_trades_2001_2025_raw.csv"
OUTPUT_CSV = "data/all_player_trades_2001_2025_clean.csv"
# ----------------------------------------

# Matches one "The <team> traded <assets> to the <team> [for <assets>]" clause,
# optionally prefixed by "In a N-team trade, ". Capture groups, in order:
#   1. team doing the trading
#   2. assets it sent
#   3. destination team
#   4. assets it got back (empty string when there is no " for ..." part)
# A clause ends at the first ';', '.' or end of text.
# NOTE(review): because the groups are lazy and '.' is a terminator, an asset
# list containing a period (e.g. initials in a name) is cut at that period --
# confirm against the raw data.
trade_regex = re.compile(
    r"(?:In a \d+-team trade, )?The (.*?) traded (.*?) to the (.*?)(?: for (.*?))?(?:;|\.|$)",
    flags=re.IGNORECASE,
)

# Accumulator for the output rows and the running number used to build trade IDs.
player_trade_rows = []
trade_counter = 1

# Load the raw trades; the file must already exist at RAW_CSV.
df = pd.read_csv(RAW_CSV)
def extract_players(asset_text):
    """
    Extract only player names from the asset string of one side of a trade.

    Draft picks (together with a parenthetical note that directly follows a
    pick), cash mentions and quote characters are removed; the remainder is
    split on commas and " and ".

    Args:
        asset_text: Asset description for one side of a trade. Any value that
            is not a string (for example a missing value) is treated as
            containing no players.

    Returns:
        A list of the remaining names in order of appearance, or
        ['picks/cash'] when nothing is left after cleaning.
    """
    if not isinstance(asset_text, str):
        return ["picks/cash"]

    # Remove every draft pick on its own. The match is lazy and cannot cross a
    # comma, so players listed between or after picks are preserved. A note in
    # parentheses right after the pick is removed with it; an unclosed
    # parenthesis is consumed up to the end of the text.
    asset_text = re.sub(
        r'\d{4}[^,]*?round draft pick(?:\s*\([^)]*(?:\)|$))?',
        '', asset_text, flags=re.IGNORECASE)
    asset_text = re.sub(r'\bcash\b', '', asset_text, flags=re.IGNORECASE)

    # Remove all quotes.
    # NOTE: apostrophes are removed as well, so "O'Neal" becomes "ONeal".
    asset_text = asset_text.replace('"', '').replace("'", "")

    # Split by commas and "and", then drop blanks and stray single characters
    # (such as the article "a" left behind by a removed pick).
    players = []
    for candidate in re.split(r',| and ', asset_text):
        name = candidate.strip()
        if len(name) > 1:
            players.append(name)

    # If nothing is left, mark as picks/cash
    return players or ["picks/cash"]
# Iterate over each trade row in the raw CSV and emit one output row per
# player that changed teams.
for _, row in df.iterrows():
    raw_text = row["raw_text"]
    date = row["date"]
    season = row.get("season", "")  # Use season from raw CSV if present

    # The counter advances for every raw row, including rows that produce no
    # output, so a trade ID reflects the row's position in the raw file.
    trade_id = f"T{trade_counter:05d}"  # unique trade ID
    trade_counter += 1

    # pandas reads an empty cell as NaN (a float); the regex can only search
    # strings, so such rows are skipped instead of raising TypeError.
    if not isinstance(raw_text, str):
        continue

    # One raw description may hold several "The X traded ... to the Y" clauses
    # (multi-team trades); all of them share the same trade_id.
    for match in trade_regex.findall(raw_text):
        # findall yields '' for the optional "for ..." group when it is absent.
        from_team, sent_assets, to_team, received_assets = (
            part.strip() for part in match
        )

        players_sent = extract_players(sent_assets)
        players_received = extract_players(received_assets)

        # Skip trade if both sides are only picks/cash
        if players_sent == ["picks/cash"] and players_received == ["picks/cash"]:
            continue

        players_sent_str = " and ".join(players_sent)
        players_received_str = " and ".join(players_received)

        # In "The X traded A to the Y for B", X (from_team) gave up A and got
        # B back. So the assets after "for" were received by from_team ...
        for acquired_player in players_received:
            if acquired_player.lower() == "picks/cash":
                continue  # no player was received on this side
            player_trade_rows.append({
                "season": season,
                "date": date,
                "trade_id": trade_id,
                "team": from_team,
                "player_received": acquired_player,
                "players_sent": players_sent_str,
                "other_team": to_team,
            })

        # ... and the assets before "to the" were received by to_team.
        for acquired_player in players_sent:
            if acquired_player.lower() == "picks/cash":
                continue  # no player was received on this side
            player_trade_rows.append({
                "season": season,
                "date": date,
                "trade_id": trade_id,
                "team": to_team,
                "player_received": acquired_player,
                "players_sent": players_received_str,
                "other_team": from_team,
            })

# Save cleaned CSV. The directory is derived from OUTPUT_CSV so that changing
# the configured path does not leave the target directory missing.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Explicit columns keep the header present even when no rows were produced.
player_trades_df = pd.DataFrame(
    player_trade_rows,
    columns=[
        "season",
        "date",
        "trade_id",
        "team",
        "player_received",
        "players_sent",
        "other_team",
    ],
)
player_trades_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Player trades saved: {len(player_trades_df)} rows")
print(f"📁 {OUTPUT_CSV}")