-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.py
More file actions
56 lines (43 loc) · 1.9 KB
/
run_analysis.py
File metadata and controls
56 lines (43 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Run this script to generate dataset insights
# Make sure you have the required packages installed:
# pip install pandas matplotlib seaborn numpy
from dataset_insights_analyzer import DatasetInsightsAnalyzer, compare_language_distributions, analyze_patch_complexity
def main(sample_size=None, data_dir: str = "./data", output_dir: str = "./dataset_insights") -> None:
    """Run the dataset analysis pipeline and print the resulting LaTeX tables.

    Steps, in order:
      1. Build a ``DatasetInsightsAnalyzer`` and call ``run_full_analysis``.
      2. Run ``compare_language_distributions`` on the analyzer.
      3. Reload the data, clean/filter/featurize the training split, and
         pass it to ``analyze_patch_complexity``.
      4. Print a completion banner and every LaTeX table returned in step 1.

    Args:
        sample_size: Forwarded unchanged to ``run_full_analysis``. ``None``
            (the default) requests the full analysis; an integer such as
            ``50000`` requests a quicker sampled run for testing.
        data_dir: Directory handed to the analyzer as its data location.
        output_dir: Directory handed to the analyzer as its output location;
            also the folder named in the completion message.
    """
    banner = "=" * 80
    section_rule = "=" * 60

    print(banner)
    print("COMPREHENSIVE DATASET ANALYSIS")
    print(banner)

    analyzer = DatasetInsightsAnalyzer(data_dir=data_dir, output_dir=output_dir)

    if sample_size is None:
        print("Running full dataset analysis...")
    else:
        print(f"Running sampled dataset analysis (sample_size={sample_size})...")
    # Only the LaTeX tables are used below; the summary is discarded here.
    # NOTE(review): the completion message suggests the analyzer also writes
    # the summary to disk -- confirm in dataset_insights_analyzer.
    _summary, latex_tables = analyzer.run_full_analysis(sample_size=sample_size)

    print("\n" + section_rule)
    print("ADDITIONAL ANALYSES")
    print(section_rule)

    # Language comparison
    compare_language_distributions(analyzer)

    # Patch complexity analysis runs on the cleaned, filtered, featurized
    # training split; the test and validation splits are not needed here.
    train_df, _test_df, _val_df = analyzer.load_all_data()
    train_filtered = analyzer.filter_python_undefined(analyzer.clean_data(train_df))
    train_filtered = analyzer.extract_features(train_filtered)
    analyze_patch_complexity(train_filtered)

    # Report the folder that was actually given to the analyzer, normalized
    # to a single trailing slash.
    output_folder = output_dir.rstrip("/") + "/"
    print("\n" + banner)
    print("ANALYSIS COMPLETE!")
    print(banner)
    print(f"Check the '{output_folder}' folder for:")
    print(" 📊 Visualizations (PNG files)")
    print(" 📋 LaTeX tables (latex_tables.tex)")
    print(" 📈 Summary statistics (dataset_summary.json)")

    # Display LaTeX tables for easy copy-paste
    print("\n" + section_rule)
    print("LATEX TABLES FOR YOUR PAPER")
    print(section_rule)
    for index, table in enumerate(latex_tables, 1):
        print(f"\n--- Table {index} ---")
        print(table)


if __name__ == "__main__":
    main()