-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmarkdown_cleaner.py
More file actions
executable file
·354 lines (285 loc) · 14.5 KB
/
markdown_cleaner.py
File metadata and controls
executable file
·354 lines (285 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/env python3
"""
Markdown Cleaner
This script cleans up Markdown files generated by the scraper:
- Removes duplicate whitespace
- Improves heading structure
- Cleans up code blocks
- Removes navigation elements that might have been captured
- Standardizes formatting
"""
import os
import re
import argparse
import logging
from pathlib import Path
from typing import List, Dict
# Logging setup: records at INFO and above go both to a log file in the
# current working directory and to the console.
_file_handler = logging.FileHandler("markdown_cleaner.log")
_console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[_file_handler, _console_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-wide logger used by the cleaner.
logger = logging.getLogger(__name__)
class MarkdownCleaner:
    """Normalise scraper-generated Markdown API pages into a fixed layout.

    Each ``.md`` file is pre-cleaned (navigation chrome, bold markers and
    duplicated text removed), split into logical sections (title, category,
    description, examples, parameters, returns), cleaned section by section
    and re-assembled in that order.

    Attributes:
        input_dir: Directory searched recursively for ``.md`` files.
        output_dir: Directory the cleaned files are written to, mirroring the
            input layout. Falls back to ``input_dir`` (in-place cleaning).
    """

    # Same logger object as the module-level ``logger`` (both are looked up
    # by ``__name__``); resolved here so the class has no hidden dependency
    # on a module global.
    _log = logging.getLogger(__name__)

    # A fenced code block: opening ``` line (any info string), body (group 1)
    # and the closing ``` line.
    _FENCE_RE = re.compile(r'^```[^\n`]*\n(.*?)^```[ \t]*$', re.MULTILINE | re.DOTALL)

    # Parts of a code line that spacing rules must never touch: string
    # literals and line comments.  ``(?<!:)`` keeps the ``//`` of an unquoted
    # URL (``https://...``) from being read as a comment.
    _PROTECTED_RE = re.compile(
        r'"(?:[^"\\]|\\.)*"'
        r"|'(?:[^'\\]|\\.)*'"
        r'|`(?:[^`\\]|\\.)*`'
        r'|(?<!:)//.*'
    )
    _PLACEHOLDER = '\x00'
    _PLACEHOLDER_RE = re.compile(r'\x00')

    # Labels that start the parameters / returns sections.
    _SECTION_LABEL_RE = re.compile(r'(?:Parameters|Args|Arguments|Returns?|Output):')

    def __init__(self, input_dir: str, output_dir: str = None):
        """Remember where to read from and where to write to.

        Args:
            input_dir: Directory containing the Markdown files.
            output_dir: Destination directory; ``None`` (or empty) means the
                files are cleaned in place.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir or input_dir

    # ------------------------------------------------------------------ I/O

    def clean_directory(self):
        """Process all markdown files in the directory recursively."""
        input_root = os.path.abspath(self.input_dir)
        output_root = os.path.abspath(self.output_dir)
        for root, dirs, files in os.walk(self.input_dir):
            if output_root != input_root:
                # When the output directory lives inside the input tree, do
                # not descend into it: that would re-clean our own output and
                # nest it one level deeper on every run.
                dirs[:] = [
                    name for name in dirs
                    if os.path.abspath(os.path.join(root, name)) != output_root
                ]
            for file_name in files:
                if not file_name.endswith('.md'):
                    continue
                input_path = os.path.join(root, file_name)
                rel_path = os.path.relpath(input_path, self.input_dir)
                output_path = os.path.join(self.output_dir, rel_path)
                parent = os.path.dirname(output_path)
                if parent:
                    os.makedirs(parent, exist_ok=True)
                self.clean_file(input_path, output_path)

    def clean_file(self, input_path: str, output_path: str):
        """Clean a single markdown file.

        Args:
            input_path: File to read (UTF-8).
            output_path: File to write the cleaned document to (UTF-8); may be
                the same path as ``input_path``.
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()
        content = self.pre_clean_content(content)
        cleaned_content = self.clean_content(content)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

    # ------------------------------------------------------------ pre-clean

    @staticmethod
    def _map_outside(pattern, text: str, transform) -> str:
        """Apply ``transform`` to the text between matches of ``pattern``.

        The matched spans themselves are copied through unchanged.
        """
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            pieces.append(transform(text[cursor:match.start()]))
            pieces.append(match.group(0))
            cursor = match.end()
        pieces.append(transform(text[cursor:]))
        return ''.join(pieces)

    @staticmethod
    def _pre_clean_prose(text: str) -> str:
        """Remove formatting artifacts from a stretch of prose (never code)."""
        # Empty "Parameters: -" placeholders.
        text = re.sub(r'Parameters:\s*-\s*$', '', text, flags=re.MULTILINE)
        # Bold section labels are dropped entirely ...
        text = re.sub(r'\*\*(?:Category|Description|Examples?):\*\*', '', text)
        # ... any other bold span loses only its markers, not its text.
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
        # Collapse text repeated back to back at the start of a line.  The
        # trailing lookahead makes sure a whole repetition is matched, so
        # "a apple" is not mistaken for "a a" + "pple".
        text = re.sub(r'(^|\n)([^#\n]+?)(?:\s+\2)+(?=\s|\Z)', r'\1\2', text)
        # Keep only the first of several consecutive Category lines, and keep
        # the line break so the following text is not glued onto it.
        text = re.sub(
            r'(?:Category:?\s*[^\n]+\s*){2,}',
            lambda m: m.group(0).split('\n')[0].rstrip() + '\n',
            text,
        )
        # Runs of periods become a single period.
        return re.sub(r'\.{2,}', '.', text)

    def pre_clean_content(self, content: str) -> str:
        """Pre-clean the content to remove formatting artifacts.

        Prose-only rules (bold markers, duplicate text, repeated periods) are
        not applied inside fenced code blocks, where ``**`` and ``...`` are
        real syntax.

        Args:
            content: Raw Markdown text.

        Returns:
            The text with navigation chrome and artifacts removed, at most one
            blank line in a row, and normalised code-fence openers.
        """
        # Navigation chrome captured by the scraper.
        content = re.sub(r'Need help\?.*?Changelog', '', content, flags=re.DOTALL)
        content = self._map_outside(self._FENCE_RE, content, self._pre_clean_prose)
        content = re.sub(r'\n{3,}', '\n\n', content)
        # "``` typescript  \n" -> "```typescript\n"
        content = re.sub(r'```\s*(typescript|javascript)?\s*\n', r'```\1\n', content)
        return content

    # ------------------------------------------------------------ structure

    def clean_content(self, content: str) -> str:
        """Apply cleaning rules to the content.

        Splits the text into sections, cleans each one with the cleaner that
        matches its type and rebuilds the document.
        """
        sections = self.split_into_sections(content)
        cleaned_sections = {
            name: self.clean_section(name, body) for name, body in sections.items()
        }
        return self.reconstruct_document(cleaned_sections)

    @classmethod
    def _extract_description(cls, content: str) -> str:
        """Return the introductory prose of a page as a single line.

        Blank lines, the first heading and ``Category`` lines are skipped.
        Collection stops at the next heading, a code fence or a
        parameters/returns label.
        """
        collected = []
        title_seen = False
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if line.startswith('```') or cls._SECTION_LABEL_RE.match(line):
                break
            if line.startswith('#'):
                if collected or title_seen:
                    break
                title_seen = True
                continue
            if re.match(r'Category(?::|\s|$)', line):
                continue
            if not collected:
                line = re.sub(r'^Description:\s*', '', line)
            if line:
                collected.append(line)
        return ' '.join(collected)

    def split_into_sections(self, content: str) -> Dict[str, str]:
        """Split the content into logical sections.

        Returns:
            A dict that may contain ``title``, ``category``, ``description``,
            ``parameters`` and ``returns`` (strings) and ``examples`` (a list
            of code strings, one per fenced block).  Missing sections are
            simply absent.
        """
        sections = {}
        # Labels and headings are searched in prose only, so a "# comment" or
        # "Returns:" inside a code example cannot be mistaken for structure.
        prose = self._FENCE_RE.sub('', content)

        # Title: the first level-1 heading, wherever it is in the file.
        title_match = re.search(r'^#(?!#)[ \t]*(.+?)[ \t]*$', prose, re.MULTILINE)
        if title_match:
            sections['title'] = title_match.group(1).strip()

        category_match = re.search(r'Category:?\s*([^#\n]+)', prose)
        if category_match:
            sections['category'] = category_match.group(1).strip()

        description = self._extract_description(content)
        if description:
            # Upper-case the first character only; the rest is left alone.
            sections['description'] = description[:1].upper() + description[1:]

        # Every fenced block counts as an example, whatever its info string;
        # matching all fences keeps opening and closing markers paired.
        code_blocks = [
            match.group(1).strip('\n')
            for match in self._FENCE_RE.finditer(content)
            if match.group(1).strip()
        ]
        if code_blocks:
            sections['examples'] = code_blocks

        params_match = re.search(
            r'(?:Parameters|Args|Arguments):(.*?)(?=\n#|\n(?:Returns?|Output):|\Z)',
            prose, re.DOTALL)
        if params_match:
            sections['parameters'] = params_match.group(1).strip()

        returns_match = re.search(r'(?:Returns?|Output):(.*?)(?=\n#|\Z)', prose, re.DOTALL)
        if returns_match:
            sections['returns'] = returns_match.group(1).strip()
        return sections

    def clean_section(self, section_name: str, content: str) -> str:
        """Clean a specific section based on its type.

        ``content`` is a list of code strings for ``examples`` and a string
        for every other section.  Empty content yields ``""``.
        """
        if not content:
            return ""
        cleaners = {
            'examples': self.clean_code_examples,
            'parameters': self.clean_parameters,
            'returns': self.clean_returns,
        }
        return cleaners.get(section_name, self.clean_text)(content)

    # ----------------------------------------------------------------- code

    def clean_code_examples(self, examples: List[str]) -> str:
        """Clean and format code examples.

        Returns:
            The examples as ``typescript`` fenced blocks separated by a blank
            line.
        """
        cleaned_examples = []
        for example in examples:
            lines = [line.rstrip() for line in example.split('\n')]
            # Trim blank lines at both ends of the example.
            first, last = 0, len(lines)
            while first < last and not lines[first].strip():
                first += 1
            while last > first and not lines[last - 1].strip():
                last -= 1
            cleaned_example = '\n'.join(
                self.format_code_line(line) for line in lines[first:last])
            # An object literal that is empty across lines becomes "{ }".
            cleaned_example = re.sub(r'{\s*\n\s*}', '{ }', cleaned_example)
            cleaned_examples.append(cleaned_example)
        return '\n\n'.join(f'```typescript\n{example}\n```' for example in cleaned_examples)

    @staticmethod
    def _format_import(match) -> str:
        """Render ``import {a,b} from "x"`` as ``import { a, b } from "x"``."""
        names = ", ".join(
            part.strip() for part in match.group(1).split(",") if part.strip())
        return f'import {{ {names} }} from "{match.group(2)}"'

    @staticmethod
    def _space_code(code: str) -> str:
        """Normalise spacing in code whose strings/comments are placeholders.

        Only unambiguous cases are touched.  ``+ - * /`` are left alone
        because they also occur in paths, negative numbers and comments, and
        multi-character operators (``=>``, ``==``, ``+=`` ...) are never split.
        """
        # "key:value" -> "key: value" (not "::" and not "scheme://").
        code = re.sub(r'(?<=[\w?)\]\x00]):(?=[^\s:/])', ': ', code)
        # "a,b" -> "a, b" (no space is forced before a closing bracket).
        code = re.sub(r',(?=[^\s)\]}])', ', ', code)
        # Lone "=" between operands.  A following "{" or string is skipped so
        # JSX attributes (prop={x}, prop="x") keep their tight form.
        code = re.sub(r'(?<=[\w)\]])=(?=[\w(\[!-])', ' = ', code)
        # No padding just inside parentheses and square brackets.
        code = re.sub(r'([(\[])[ \t]+', r'\1', code)
        code = re.sub(r'[ \t]+([)\]])', r'\1', code)
        # ") {" for blocks, "{ }" for empty objects, "fn({" for option bags.
        code = re.sub(r'(?<=\))\{', ' {', code)
        code = re.sub(r'\{\s*\}', '{ }', code)
        code = re.sub(
            r'\b(?!(?:return|if|while|for|switch|await|yield|typeof)\b)(\w+)\s*\(\s*\{',
            r'\1({', code)
        code = re.sub(r'const\s+(\w+)\s*=(?![=>])\s*', r'const \1 = ', code)
        code = re.sub(r'await\s+(\w+)', r'await \1', code)
        # "Promise< T >" -> "Promise<T>"; must follow an identifier so that
        # comparisons ("a < b") are not affected.
        code = re.sub(r'(?<=\w)<\s*([\w\s,.\[\]]+?)\s*>', r'<\1>', code)
        # BigInt literal split by the scraper: "10 n" -> "10n".
        code = re.sub(r'\b(\d+)\s+n\b', r'\1n', code)
        return re.sub(r'[ \t]+', ' ', code).strip()

    def format_code_line(self, line: str) -> str:
        """Format a single line of code.

        Leading indentation is preserved, and the contents of string literals
        and comments are never re-spaced.

        Args:
            line: One line of a code example (no newline).

        Returns:
            The formatted line without trailing whitespace; ``""`` for a blank
            line.
        """
        original_line = line
        body = line.strip()
        if not body:
            return ""
        indent = line[:len(line) - len(line.lstrip())]
        if self._PLACEHOLDER in body:
            # The placeholder character is already in use on this line, so the
            # shielding below would be ambiguous; leave the code untouched.
            return indent + body

        # A comment marker reduced to one leading slash.  JSX "/>" and
        # anything with a second slash (regex literal, path) is not a comment.
        body = re.sub(r'^/(?![/*>])(?=[^/]*$)\s*', '// ', body)
        body = re.sub(r'import\s*{([^}]+)}\s*from\s*"([^"]+)"', self._format_import, body)

        # Swap strings and comments for placeholders, re-space what is left,
        # then put the protected spans back in their original order.
        shielded = []

        def _shield(match):
            shielded.append(match.group(0))
            return self._PLACEHOLDER

        code = self._space_code(self._PROTECTED_RE.sub(_shield, body))
        pending = iter(shielded)

        def _unshield(_match):
            span = next(pending)
            if span.startswith('//'):
                span = re.sub(r'^//\s*the\s+', '// The ', span)
            return span

        body = self._PLACEHOLDER_RE.sub(_unshield, code)

        # Repairs for known scraper damage; these deliberately reach into
        # string literals.
        body = re.sub(
            r'from\s*"([^"]+)"',
            lambda m: 'from "' + re.sub(r'\s*/\s*', '/', m.group(1)) + '"',
            body)
        body = re.sub(r'"(https?):\s*/\s*/\s*', r'"\1://', body)
        body = re.sub(r'thirdweb\s*/\s*([^"\s]+)', r'thirdweb/\1', body)
        body = re.sub(r'0x-', '0x', body)
        body = re.sub(r'0x\.+', '0x...', body)

        result = (indent + body).rstrip()
        if result != original_line:
            self._log.debug("Cleaned line:\nFrom: %s\nTo: %s", original_line, result)
        return result

    # ------------------------------------------------- parameters / returns

    def clean_parameters(self, params: str) -> str:
        """Clean and format parameters section.

        A line starts a new parameter when it has an explicit marker: a
        bullet, a back-ticked or bare identifier on its own, or a name
        followed by ``:`` / `` - ``.  Any other line continues the previous
        parameter's description.

        Returns:
            A ``### Parameters`` block with one ``- `name` (`type`): text``
            bullet per parameter, or ``""`` when nothing was recognised.
        """
        if not params.strip():
            return ""
        # name [ (type) ] followed by a real separator
        strict = re.compile(
            r'^(?:[-*]\s*)?`?(\w+)`?\s*(?:\((.*?)\))?\s*(?::(?!//)|-(?=\s))\s*(.*)$')
        # the permissive fallback: separator optional
        loose = re.compile(r'^[-*]?\s*`?(\w+)`?\s*(?:\((.*?)\))?\s*(?::|-)?\s*(.*)$')
        bare_name = re.compile(r'^`?\w+`?$')

        entries = []
        for raw_line in params.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            match = strict.match(line)
            if match is None and (not entries or line[0] in '-*`' or bare_name.match(line)):
                match = loose.match(line)
            if match:
                name, type_info, description = match.groups()
                description = re.sub(r'\s+', ' ', description).strip()
                # Upper-case the first character only: str.capitalize() would
                # lower-case the rest and destroy identifiers and acronyms.
                description = description[:1].upper() + description[1:]
                type_str = f" (`{type_info}`)" if type_info else ""
                entries.append(f"- `{name}`{type_str}: {description}".rstrip())
            elif entries:
                # Continuation of the previous parameter's description.
                if entries[-1].endswith(':'):
                    line = line[:1].upper() + line[1:]
                entries[-1] = f"{entries[-1]} {line}"
        if entries:
            return "\n\n### Parameters\n\n" + '\n'.join(entries)
        return ""

    def clean_returns(self, returns: str) -> str:
        """Clean and format returns section.

        The first line is rewritten to ``` `Type`: description ``` only when it
        names a type explicitly (back-ticks, a parenthesised type, or a
        ``:`` / `` - `` separator).  Plain prose is kept as written.

        Returns:
            A ``### Returns`` block, or ``""`` for blank input.
        """
        if not returns.strip():
            return ""
        first_line, newline, remainder = returns.strip().partition('\n')
        match = re.match(
            r'^(`?)(\w+)(`?)\s*(?:\((.*?)\))?\s*(:(?!//)|-(?=\s))?\s*(.*)$', first_line)
        if match:
            open_tick, type_name, close_tick, type_info, separator, description = match.groups()
            explicit = (open_tick and close_tick) or type_info is not None or separator
            if explicit:
                type_str = f"`{type_info or type_name}`"
                first_line = f"{type_str}: {description}" if description else type_str
        return "\n\n### Returns\n\n" + first_line + newline + remainder

    # ----------------------------------------------------------------- text

    def clean_text(self, text: str) -> str:
        """Clean and format regular text.

        Collapses repeated words and whitespace, normalises spacing around
        punctuation and capitalises the first letter of each sentence.
        """
        if not text:
            return ""
        # Whole words repeated back to back ("the the"); \b on both sides so
        # "is isolated" is not treated as a repetition.
        text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)
        text = re.sub(r'Need help\?.*?Changelog', '', text, flags=re.DOTALL)
        text = re.sub(r'\s+', ' ', text).strip()
        # Exactly one space between a sentence end and the next capital.
        text = re.sub(r'(?<=[.!?])\s*(?=[A-Z])', ' ', text)
        text = re.sub(r'(?<=\w),(?=\w)', ', ', text)  # space after commas
        text = re.sub(r'\s+([.,:;])', r'\1', text)    # none before punctuation
        text = re.sub(r'\.{2,}', '.', text)           # repeated periods
        # Capitalise sentence starts, keeping the whitespace between sentences.
        text = re.sub(
            r'(^|(?<=[.!?])\s+)([a-z])',
            lambda m: m.group(1) + m.group(2).upper(),
            text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # ------------------------------------------------------------- assembly

    def reconstruct_document(self, sections: Dict[str, str]) -> str:
        """Reconstruct the document from cleaned sections.

        Sections appear in the order title, category, description, examples,
        parameters, returns.  Empty or missing sections are skipped.

        Returns:
            The document with exactly one blank line between blocks and a
            trailing newline, or ``""`` when there is nothing to write.
        """
        parts = []
        title = sections.get('title')
        if title:
            parts.append(f"# {title}")
        category = sections.get('category')
        if category:
            parts.append(f"Category: {category}")
        description = sections.get('description')
        if description:
            parts.append(description)
        examples = sections.get('examples')
        if examples:
            parts.append("### Examples")
            parts.append(examples)
        for key in ('parameters', 'returns'):
            block = sections.get(key)
            if block:
                parts.append(block)
        blocks = [part.strip() for part in parts if part.strip()]
        if not blocks:
            return ""
        return '\n\n'.join(blocks) + '\n'
def main(argv=None):
    """Command-line entry point: clean every Markdown file under a directory.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``.  ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: With status 2 when the arguments are invalid or
            ``input_dir`` is not an existing directory.
    """
    parser = argparse.ArgumentParser(description='Clean and standardize markdown documentation files.')
    parser.add_argument('input_dir', help='Input directory containing markdown files')
    parser.add_argument('--output-dir', help='Output directory for cleaned files (default: same as input)')
    args = parser.parse_args(argv)

    # os.walk() silently yields nothing for a missing directory, which would
    # make a mistyped path look like a successful run.
    if not os.path.isdir(args.input_dir):
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    cleaner = MarkdownCleaner(args.input_dir, args.output_dir)
    cleaner.clean_directory()


if __name__ == '__main__':
    main()