-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: sample_scraper.py
More file actions
241 lines (206 loc) · 10 KB
/
sample_scraper.py
File metadata and controls
241 lines (206 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python3
"""
Sample scraper that provides realistic swimming data for testing
This will be used until the main scraper is enhanced for the modern USA Swimming website
"""
import hashlib
import json
import random
import time
from datetime import datetime
from typing import Any, Dict, List
# Sample swimming data based on actual USA Swimming results.
# Each record carries the fields copied verbatim into generated result rows:
#   name   - display name of the athlete
#   gender - "Female" / "Male"; used by the gender filter
#   team   - club or university team name
#   lsc    - Local Swimming Committee; used by the --lsc filter
SAMPLE_SWIMMERS = [
    # Female swimmers
    {"name": "Gretchen Walsh", "gender": "Female", "team": "Nashville Aquatic Club", "lsc": "Southeastern Swimming"},
    {"name": "Torri Huske", "gender": "Female", "team": "Arlington Aquatic Club", "lsc": "Potomac Valley Swimming"},
    {"name": "Julia Dennis", "gender": "Female", "team": "Unattached", "lsc": "Pacific Swimming"},
    {"name": "Claire Curzan", "gender": "Female", "team": "TAC Titans", "lsc": "North Carolina Swimming"},
    {"name": "Camille Spink", "gender": "Female", "team": "University Of Tennessee", "lsc": "Southeastern Swimming"},
    {"name": "Sophie Yendell", "gender": "Female", "team": "Team Pittsburgh Elite Aquatics", "lsc": "Allegheny Mountain"},
    {"name": "Emma Sticklen", "gender": "Female", "team": "University of Texas", "lsc": "South Texas Swimming"},
    {"name": "Kristina Paegle", "gender": "Female", "team": "Indiana University", "lsc": "Indiana Swimming"},
    {"name": "Cadence Vincent", "gender": "Female", "team": "Unattached", "lsc": "Pacific Swimming"},
    {"name": "Katie Ledecky", "gender": "Female", "team": "Stanford Swimming", "lsc": "Pacific Swimming"},
    {"name": "Lilly King", "gender": "Female", "team": "Indiana University", "lsc": "Indiana Swimming"},
    # Male swimmers
    {"name": "Brady Kendall", "gender": "Male", "team": "Univ. Of Michigan Swim Team", "lsc": "Michigan Swimming"},
    {"name": "Bobby Finke", "gender": "Male", "team": "University of Florida", "lsc": "Florida Swimming"},
    {"name": "Ryan Murphy", "gender": "Male", "team": "Cal Aquatics", "lsc": "Pacific Swimming"},
    {"name": "Caeleb Dressel", "gender": "Male", "team": "Florida Gators", "lsc": "Florida Swimming"},
    {"name": "Nathan Adrian", "gender": "Male", "team": "Cal Aquatics", "lsc": "Pacific Swimming"},
    {"name": "Zach Apple", "gender": "Male", "team": "Indiana University", "lsc": "Indiana Swimming"},
    {"name": "Brooks Curry", "gender": "Male", "team": "Louisiana State University", "lsc": "Southeastern Swimming"},
    {"name": "Drew Kibler", "gender": "Male", "team": "University of Texas", "lsc": "Potomac Valley Swimming"},
    {"name": "Carson Foster", "gender": "Male", "team": "University of Texas", "lsc": "Ohio Swimming"},
]
def generate_swim_time(event: str, rank: int) -> str:
"""Generate realistic swim times based on event and rank"""
base_times = {
"50 FR SCY": 20.37,
"100 FR SCY": 45.12,
"200 FR SCY": 1*60 + 40.25,
"50 BK SCY": 22.85,
"100 BK SCY": 49.33,
"50 BR SCY": 25.41,
"100 BR SCY": 55.72,
"50 FL SCY": 21.89,
"100 FL SCY": 48.19,
"100 IM SCY": 52.13,
"200 IM SCY": 1*60 + 53.24,
"400 IM SCY": 4*60 + 2.85,
}
base_time = base_times.get(event, 25.0)
# Add variance based on rank
variance = (rank - 1) * 0.1 + random.uniform(0, 0.2)
total_time = base_time + variance
# Format time appropriately
if total_time < 60:
return f"{total_time:.2f}"
elif total_time < 3600:
minutes = int(total_time // 60)
seconds = total_time % 60
return f"{minutes}:{seconds:05.2f}"
else:
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = total_time % 60
return f"{hours}:{minutes:02d}:{seconds:05.2f}"
def _age_for_group(age_group: str) -> int:
    """Return a random age consistent with the given age-group label."""
    # Inclusive (low, high) bounds per supported label; unknown labels
    # fall back to a broad senior range, matching the original behavior.
    ranges = {
        "11-12": (11, 12),
        "13-14": (13, 14),
        "15-16": (15, 16),
        "17-18": (17, 18),
        "10 & Under": (8, 10),
        "19 & Over": (19, 25),
        "Open": (15, 24),
    }
    lo, hi = ranges.get(age_group, (15, 22))
    return random.randint(lo, hi)


def _adjust_time(time_str: str, year_variance: float) -> str:
    """Shift the final seconds field of *time_str* by *year_variance* plus jitter.

    Works for "SS.hh", "M:SS.hh", and "H:MM:SS.hh" formats; only the
    trailing seconds component is modified.
    """
    parts = time_str.split(":")
    seconds = float(parts[-1]) + year_variance + random.uniform(-0.1, 0.1)
    if len(parts) == 1:  # seconds-only format
        return f"{seconds:.2f}"
    return ":".join(parts[:-1]) + f":{seconds:05.2f}"


def generate_sample_results(params: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Generate deterministic sample swimming results for *params*.

    Recognized keys (all optional):
        max_results (int): cap on rows returned; default 50, bounded by
            5x the sample pool so repeats stay plausible.
        event / gender / age_group / year: search criteria; defaults are
            '50 FR SCY' / 'Female' / 'Open' / '2025'.
        lsc (str | None): keep only swimmers from this LSC.
        exclude_foreign (bool): drop rows randomly flagged as foreign.
        best_times_only (bool): emit at most one row per athlete.

    Returns:
        A list of result dicts (rank, swim_time, name, age, event, lsc,
        team, meet, time_standard, is_foreign), re-ranked after filtering.
        Empty when no sample swimmer matches the requested gender.
    """
    max_results = min(params.get('max_results', 50), len(SAMPLE_SWIMMERS) * 5)
    event = params.get('event', '50 FR SCY')
    gender = params.get('gender', 'Female')
    age_group = params.get('age_group', 'Open')
    year = params.get('year', '2025')
    lsc_filter = params.get('lsc')
    exclude_foreign = params.get('exclude_foreign', False)
    best_times_only = params.get('best_times_only', False)

    # Seed the RNG from the search parameters so the same query always
    # yields the same data. BUG FIX: the previous hash()-based seed varied
    # between interpreter runs because of string-hash randomization
    # (PYTHONHASHSEED); an MD5 digest is stable across processes.
    seed_key = f"{year}|{event}|{gender}|{age_group}|{lsc_filter}"
    random.seed(int(hashlib.md5(seed_key.encode('utf-8')).hexdigest(), 16))

    # Filter swimmers by gender first.
    pool = [s for s in SAMPLE_SWIMMERS if s["gender"] == gender]
    if not pool:
        # BUG FIX: an unrecognized gender previously crashed with
        # ZeroDivisionError on `i % len(pool)`.
        return []

    # Shuffle so each seed (i.e. each year/query) orders swimmers differently.
    random.shuffle(pool)

    try:
        year_int = int(year)
    except (TypeError, ValueError):
        year_int = 2025  # tolerate a malformed year instead of raising

    # Older seasons are slightly slower: 0.05 s per year before 2025.
    year_variance = (2025 - year_int) * 0.05

    results: List[Dict[str, Any]] = []
    seen_athletes = set()  # tracks base names for best_times_only
    for i in range(max_results):
        swimmer = pool[i % len(pool)]

        # BUG FIX: best_times_only was parsed but ignored; now each athlete
        # contributes at most one (their best, i.e. first/fastest) swim.
        if best_times_only and swimmer["name"] in seen_athletes:
            continue

        age = _age_for_group(age_group)

        # Roughly 10% of rows enter a coin flip for the foreign flag.
        is_foreign = random.choice([True, False]) if random.random() < 0.1 else False

        # Apply filters before the row is counted toward the ranking.
        if exclude_foreign and is_foreign:
            continue
        if lsc_filter and swimmer["lsc"] != lsc_filter:
            continue

        base_swim_time = generate_swim_time(event, len(results) + 1)
        final_swim_time = _adjust_time(base_swim_time, year_variance)

        results.append({
            "rank": len(results) + 1,  # re-rank after filtering
            "swim_time": final_swim_time,
            # Repeat appearances get a numeric suffix so display names differ.
            "name": swimmer["name"] + (f" {i // len(pool) + 1}" if i >= len(pool) else ""),
            "age": age,
            "event": event,
            "lsc": swimmer["lsc"],
            "team": swimmer["team"],
            "meet": f"{year} Sample Championship Meet",
            "time_standard": "Sample Standard",
            "is_foreign": is_foreign,
        })
        seen_athletes.add(swimmer["name"])

        # Stop as soon as we have enough post-filter rows.
        if len(results) >= max_results:
            break
    return results
def main():
    """Command-line entry point: parse args, generate sample data, export it.

    Writes either a JSON file (with a metadata envelope) or a CSV file,
    deriving a timestamped filename when --output is not given.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Sample USA Swimming Data Scraper')
    parser.add_argument('--year', default='2025')
    parser.add_argument('--gender', default='Female')
    parser.add_argument('--age-group', default='Open')
    parser.add_argument('--event', default='50 FR SCY')
    parser.add_argument('--max-results', type=int, default=50)
    parser.add_argument('--format', default='json')
    parser.add_argument('--output', default='')
    parser.add_argument('--exclude-foreign', action='store_true', help='Exclude foreign athletes')
    parser.add_argument('--best-times-only', action='store_true', help='Show best times only')
    parser.add_argument('--lsc', type=str, help='Filter by LSC (Local Swimming Committee)')
    args = parser.parse_args()

    # Convert to a dict for consistency with generate_sample_results.
    # argparse maps '--age-group' to the attribute `age_group` directly,
    # so the former getattr(args, 'age_group') indirection was redundant.
    params = {
        'year': args.year,
        'gender': args.gender,
        'age_group': args.age_group,
        'event': args.event,
        'max_results': args.max_results,
        'lsc': args.lsc,
        'exclude_foreign': args.exclude_foreign,
        'best_times_only': args.best_times_only,
    }
    print(f"Generating sample results for: {params}")

    # Simulate the latency of a real scrape.
    time.sleep(2)

    results = generate_sample_results(params)

    # Derive an output filename when none was supplied.
    if not args.output:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        event_name = args.event.replace(' ', '_').replace('/', '_')
        args.output = f"sample_results_{event_name}_{timestamp}.{args.format}"

    # Export results.
    if args.format == 'json':
        export_data = {
            'metadata': {
                'export_timestamp': datetime.now().isoformat(),
                'total_results': len(results),
                'search_parameters': params,
                'note': 'This is sample data for testing purposes'
            },
            'results': results
        }
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2)
    else:  # any non-json format falls through to CSV, as before
        import csv
        # BUG FIX: with zero results no file was written, yet the success
        # message below still printed. Always write at least the header.
        fieldnames = list(results[0].keys()) if results else [
            'rank', 'swim_time', 'name', 'age', 'event', 'lsc',
            'team', 'meet', 'time_standard', 'is_foreign',
        ]
        with open(args.output, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)

    print(f"Results exported to: {args.output}")
    print(f"Generated {len(results)} sample results")