-
-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathanalyze_piles.py
More file actions
151 lines (130 loc) · 5.02 KB
/
analyze_piles.py
File metadata and controls
151 lines (130 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
"""Analyze piles usage across all blog posts."""
import os
import re
import yaml
from collections import Counter, defaultdict
from pathlib import Path
def extract_front_matter(file_path):
    """Extract YAML front matter from a markdown file.

    Args:
        file_path: Path (string or path-like) of the markdown file to read.

    Returns:
        The parsed front matter as a dict, or None when:
          * the file cannot be read or is not valid UTF-8,
          * the file does not start with a ``---`` delimited block,
          * the block is not valid YAML, or
          * the YAML does not parse to a mapping (an empty block, a bare
            scalar, or a list).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers
        # UnicodeDecodeError (its subclass) and malformed path strings.
        print(f"Error reading {file_path}: {e}")
        return None

    # Match front matter between --- delimiters. The closing delimiter may be
    # the very last line of the file, so accept end-of-input after it as well
    # as a newline.
    match = re.match(r'^---\s*\n(.*?)\n---\s*(?:\n|\Z)', content, re.DOTALL)
    if not match:
        return None

    # Try to normalize tabs to spaces for better YAML parsing
    yaml_content = match.group(1).replace('\t', ' ')
    try:
        front_matter = yaml.safe_load(yaml_content)
    except yaml.YAMLError as e:
        # Only the first parse error is printed (tracked with an attribute on
        # this function) so a run over many files is not flooded with output.
        if not hasattr(extract_front_matter, '_printed_error'):
            print(f"YAML parse error in {file_path}: {e}")
            extract_front_matter._printed_error = True
        return None

    # safe_load returns whatever the document holds (None, a scalar, a list,
    # ...). Callers look keys up with .get(), so only a mapping is usable.
    if not isinstance(front_matter, dict):
        return None
    return front_matter
def analyze_posts():
    """Analyze all posts and their piles.

    Walks every ``*.md`` file under ``_posts`` (relative to the current
    working directory) in sorted order, reads its front matter with
    ``extract_front_matter`` and tallies the values of its ``piles`` key.

    Returns:
        A dict with these keys:
          * ``total_files``: number of markdown files visited.
          * ``failed_parsing``: files whose front matter was missing, empty,
            unparseable, or not a mapping.
          * ``total_posts``: number of files parsed successfully.
          * ``posts_with_piles``: number of parsed posts with at least one pile.
          * ``posts_without_piles``: list of ``(file path, title)`` tuples.
          * ``all_piles``: Counter mapping pile name -> number of posts.
          * ``pile_combinations``: Counter mapping a sorted tuple of pile
            names -> number of posts using exactly that combination.
          * ``posts_by_pile``: defaultdict mapping pile name -> post titles.
          * ``post_data``: list of per-post dicts with the keys ``file``,
            ``title``, ``date`` and ``piles``.
    """
    posts_dir = Path('_posts')
    all_piles = Counter()
    pile_combinations = Counter()
    posts_by_pile = defaultdict(list)
    posts_without_piles = []
    post_data = []
    total_files = 0
    failed_parsing = 0

    for md_file in sorted(posts_dir.rglob('*.md')):
        total_files += 1
        front_matter = extract_front_matter(md_file)
        # Front matter that is not a mapping (e.g. a YAML list or a bare
        # scalar) cannot be queried with .get(); count it as a parse failure
        # instead of crashing the whole run.
        if not isinstance(front_matter, dict) or not front_matter:
            failed_parsing += 1
            continue

        title = front_matter.get('title', 'Untitled')
        date = front_matter.get('date', '')

        # Normalize piles to a list of strings. YAML may deliver None (empty
        # value), a single scalar (str, int, ...) or a list whose items are
        # not all strings; stringifying keeps the entries hashable for the
        # counters and mutually comparable for sorted() below.
        raw_piles = front_matter.get('piles')
        if raw_piles is None:
            raw_piles = []
        elif not isinstance(raw_piles, (list, tuple)):
            raw_piles = [raw_piles]
        piles = [str(pile) for pile in raw_piles if pile is not None]

        post_data.append({
            'file': str(md_file),
            'title': title,
            'date': date,
            'piles': piles,
        })

        if not piles:
            posts_without_piles.append((str(md_file), title))
            continue

        # Count individual piles
        for pile in piles:
            all_piles[pile] += 1
            posts_by_pile[pile].append(title)

        # Count pile combinations; sorting makes the key order-independent.
        pile_combinations[tuple(sorted(piles))] += 1

    return {
        'total_files': total_files,
        'failed_parsing': failed_parsing,
        'total_posts': len(post_data),
        'posts_with_piles': sum(1 for post in post_data if post['piles']),
        'posts_without_piles': posts_without_piles,
        'all_piles': all_piles,
        'pile_combinations': pile_combinations,
        'posts_by_pile': posts_by_pile,
        'post_data': post_data,
    }
def _json_default(value):
    """Convert date/datetime values to ISO-8601 strings for ``json.dump``.

    PyYAML parses timestamp-looking scalars (such as a post's ``date``) into
    ``datetime.date`` / ``datetime.datetime`` objects, which the json module
    refuses to serialize on its own. Any other unsupported type still raises
    TypeError so unexpected data is not silently stringified.
    """
    from datetime import date  # datetime.datetime is a subclass of date

    if isinstance(value, date):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def main():
    """Run the analysis, print a text report, and write pile_analysis.json."""
    import json

    results = analyze_posts()
    without_piles = results['posts_without_piles']

    print("=" * 80)
    print("PILE ANALYSIS SUMMARY")
    print("=" * 80)
    print(f"\nTotal markdown files found: {results['total_files']}")
    print(f"Failed to parse front matter: {results['failed_parsing']}")
    print(f"Successfully parsed posts: {results['total_posts']}")
    print(f"Posts with piles: {results['posts_with_piles']}")
    print(f"Posts without piles: {len(without_piles)}")
    print(f"Unique piles: {len(results['all_piles'])}")

    print("\n" + "=" * 80)
    print("PILE USAGE FREQUENCY (sorted by count)")
    print("=" * 80)
    for pile, count in results['all_piles'].most_common():
        # !s keeps the left-aligned column even if a pile is not a string.
        print(f"{pile!s:40} {count:3} posts")

    print("\n" + "=" * 80)
    print("COMMON PILE COMBINATIONS")
    print("=" * 80)
    # Drop single-pile "combinations" BEFORE taking the top 20; slicing first
    # lets frequent single piles crowd real combinations out of the listing.
    multi_pile_combos = [
        (combo, count)
        for combo, count in results['pile_combinations'].most_common()
        if len(combo) > 1
    ]
    for combo, count in multi_pile_combos[:20]:
        print(f"{count:3} posts: {', '.join(map(str, combo))}")

    print("\n" + "=" * 80)
    print(f"POSTS WITHOUT PILES ({len(without_piles)} total)")
    print("=" * 80)
    for _file_path, title in without_piles[:10]:
        print(f" {title}")
    if len(without_piles) > 10:
        print(f" ... and {len(without_piles) - 10} more")

    # Save detailed data to JSON for further analysis
    output = {
        'summary': {
            'total_posts': results['total_posts'],
            'posts_with_piles': results['posts_with_piles'],
            'posts_without_piles_count': len(without_piles),
        },
        # Convert Counter objects to dicts for JSON serialization
        'all_piles': dict(results['all_piles']),
        # JSON object keys must be strings, so tuple keys are stringified.
        'pile_combinations': {
            str(combo): count
            for combo, count in results['pile_combinations'].items()
        },
        'posts': results['post_data'],
    }
    with open('pile_analysis.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False,
                  default=_json_default)

    print("\n" + "=" * 80)
    print("Detailed analysis saved to pile_analysis.json")
    print("=" * 80)


if __name__ == '__main__':
    main()