-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathexample_usage.py
More file actions
156 lines (121 loc) · 5.22 KB
/
example_usage.py
File metadata and controls
156 lines (121 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
"""
Example usage of CSV Processor for GSSOC 25 Dashboard Update
This script demonstrates how to use the CSV processor in practice
for the database migration task.
"""
from csv_processor import CSVProcessor
import json
def _count_tech_stacks(projects):
    """Count how many projects mention each tracked technology.

    Matching is a case-insensitive substring test on ``project.tech_stack``;
    projects whose ``tech_stack`` is empty or ``None`` are skipped.

    Args:
        projects: Iterable of objects exposing a ``tech_stack`` attribute.

    Returns:
        Dict mapping a display label ('React', 'Node.js', 'Python', 'Java')
        to the number of projects mentioning it. Labels appear in the order
        they were first encountered.
    """
    counts = {}
    for project in projects:
        if not project.tech_stack:
            continue
        stack = project.tech_stack.lower()
        mentioned = (
            ('React', 'react' in stack),
            # 'node' also matches 'nodejs' and 'node.js', so one test suffices.
            ('Node.js', 'node' in stack),
            ('Python', 'python' in stack),
            # Blank out "javascript" first so that only genuine Java mentions
            # remain; a stack listing both Java and JavaScript still counts
            # as Java. Replacing with a space avoids gluing neighbours.
            ('Java', 'java' in stack.replace('javascript', ' ')),
        )
        for label, found in mentioned:
            if found:
                counts[label] = counts.get(label, 0) + 1
    return counts


def process_gssoc25_projects():
    """Process the GSSOC 25 project sheet and print summary statistics.

    Runs ``CSVProcessor`` over "Master sheet - Mapping.csv" with URL
    validation enabled and the accessibility check disabled, then prints
    the processed/valid/invalid counts, the most common tech stacks and
    mentor statistics.

    Returns:
        Tuple ``(db_ready_data, invalid_projects)`` where ``db_ready_data``
        is the result of ``processor.export_to_dict(include_invalid=False)``
        and ``invalid_projects`` is the result of
        ``processor.get_invalid_projects()``.
    """
    print("Processing GSSOC 25 Projects from CSV")
    print("=" * 50)

    # Initialize processor
    csv_file = "Master sheet - Mapping.csv"
    processor = CSVProcessor(csv_file)

    # Validate URLs but skip the accessibility check
    # (accessibility check can be slow and is optional)
    projects = processor.process_csv(
        validate_urls=True,
        check_accessibility=False,
    )

    # Split results into projects ready for insertion and ones needing review
    valid_projects = processor.get_valid_projects()
    invalid_projects = processor.get_invalid_projects()

    print(f"✓ Successfully processed {len(projects)} projects")
    print(f"✓ Valid projects ready for database: {len(valid_projects)}")
    print(f"⚠ Projects with issues: {len(invalid_projects)}")
    print()

    # Export data in format ready for MongoDB insertion
    db_ready_data = processor.export_to_dict(include_invalid=False)

    print("Project Statistics:")
    print("-" * 20)

    # Tech stack analysis: show at most the five most frequent labels
    tech_stacks = _count_tech_stacks(valid_projects)
    print("Popular Tech Stacks:")
    ranked = sorted(tech_stacks.items(), key=lambda item: item[1], reverse=True)
    for tech, count in ranked[:5]:
        print(f" {tech}: {count} projects")
    print()

    # Mentor statistics (average guarded against an empty project list)
    total_mentors = sum(len(project.mentors) for project in valid_projects)
    avg_mentors = total_mentors / len(valid_projects) if valid_projects else 0
    print(f"Total mentors: {total_mentors}")
    print(f"Average mentors per project: {avg_mentors:.1f}")
    print()

    return db_ready_data, invalid_projects
def show_sample_projects(db_data, count=3):
    """Print a short summary of the first ``count`` projects in ``db_data``.

    Args:
        db_data: Sequence of project dicts. Each dict is read through the
            keys ``project_name``, ``github_url``, ``admin_name``,
            ``tech_stack`` and ``mentors``.
        count: Maximum number of projects to print.
    """
    print(f"Sample Projects (first {count}):")
    print("-" * 30)
    for number, project in enumerate(db_data[:count], start=1):
        # A project may have no tech stack recorded; fall back to an empty
        # string so slicing and len() below cannot fail on None.
        tech = project['tech_stack'] or ''
        # Long tech descriptions are cut to 100 characters with an ellipsis.
        suffix = '...' if len(tech) > 100 else ''
        print(f"{number}. {project['project_name']}")
        print(f" GitHub: {project['github_url']}")
        print(f" Admin: {project['admin_name']}")
        print(f" Tech: {tech[:100]}{suffix}")
        print(f" Mentors: {len(project['mentors'])}")
        print()
def show_problematic_projects(invalid_projects, count=5):
    """Print the first ``count`` projects that were flagged as invalid.

    Each entry is read through its ``project_name``, ``github_url`` and
    ``validation_errors`` attributes. When ``invalid_projects`` is empty a
    single success line is printed instead.
    """
    if invalid_projects:
        print(f"Projects Requiring Manual Review (first {count}):")
        print("-" * 45)
        for position, entry in enumerate(invalid_projects[:count], start=1):
            print(f"{position}. {entry.project_name}")
            print(f" URL: {entry.github_url}")
            # All recorded problems go on one comma-separated line.
            issue_summary = ', '.join(entry.validation_errors)
            print(f" Issues: {issue_summary}")
            print()
    else:
        print("✓ No problematic projects found!")
def save_results(db_data, invalid_projects,
                 valid_path='gssoc25_projects_valid.json',
                 invalid_path='gssoc25_projects_invalid.json'):
    """Write the processing results to two JSON files.

    Args:
        db_data: JSON-serializable data for the valid projects; dumped as-is.
        invalid_projects: Iterable of objects exposing ``project_name``,
            ``github_url``, ``admin_name`` and ``validation_errors``; each
            is reduced to a dict with the keys ``project_name``,
            ``github_url``, ``admin_name`` and ``issues``.
        valid_path: Destination file for ``db_data``. Defaults to the
            original hard-coded name in the current working directory.
        invalid_path: Destination file for the invalid-project summary.
            Defaults to the original hard-coded name.
    """
    # Build the review records before touching the filesystem so that a
    # malformed project object fails fast without leaving partial output.
    invalid_data = [
        {
            'project_name': project.project_name,
            'github_url': project.github_url,
            'admin_name': project.admin_name,
            'issues': project.validation_errors,
        }
        for project in invalid_projects
    ]

    # Save valid projects for database insertion
    with open(valid_path, 'w', encoding='utf-8') as f:
        json.dump(db_data, f, indent=2, ensure_ascii=False)

    # Save invalid projects for manual review
    with open(invalid_path, 'w', encoding='utf-8') as f:
        json.dump(invalid_data, f, indent=2, ensure_ascii=False)

    print("Results saved:")
    print(f"✓ Valid projects: {valid_path} ({len(db_data)} projects)")
    print(f"⚠ Invalid projects: {invalid_path} ({len(invalid_projects)} projects)")
def main():
    """Run the full example: process the CSV, print samples, save results.

    Any exception is reported on standard output as ``Error: <message>``
    and then re-raised unchanged.
    """
    try:
        # Step 1: parse and validate the CSV
        results = process_gssoc25_projects()
        db_data, invalid_projects = results

        # Step 2: preview what was accepted and what needs manual review
        show_sample_projects(db_data)
        show_problematic_projects(invalid_projects)

        # Step 3: persist both result sets as JSON
        save_results(db_data, invalid_projects)

        print(f"\n{'=' * 50}")
        print("CSV Processing Complete!")
        print("Ready for database migration (Task 2)")
    except Exception as error:
        print(f"Error: {error!s}")
        raise
# Run the example only when this file is executed as a script; importing it
# merely defines the functions above without processing anything.
if __name__ == "__main__":
    main()