ThirdwebDocsScraper/markdown_cleaner.py at main · cobibean/ThirdwebDocsScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/env python3
"""
Markdown Cleaner

This script cleans up Markdown files generated by the scraper:
- Removes duplicate whitespace
- Improves heading structure
- Cleans up code blocks
- Removes navigation elements that might have been captured
- Standardizes formatting
"""

import os
import re
import argparse
import logging
from pathlib import Path
from typing import List, Dict

# Configure the root logger once at import time: INFO and above are written
# both to "markdown_cleaner.log" (in the current working directory) and to the
# default stream handler.
logging.basicConfig(
    handlers=[logging.FileHandler("markdown_cleaner.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

class MarkdownCleaner:
    def __init__(self, input_dir: str, output_dir: str = None):
        self.input_dir = input_dir
        self.output_dir = output_dir or input_dir

    def clean_directory(self):
        """Process all markdown files in the directory recursively."""
        for root, _, files in os.walk(self.input_dir):
            for file in files:
                if file.endswith('.md'):
                    input_path = os.path.join(root, file)
                    rel_path = os.path.relpath(input_path, self.input_dir)
                    output_path = os.path.join(self.output_dir, rel_path)
                    os.makedirs(os.path.dirname(output_path), exist_ok=True)
                    self.clean_file(input_path, output_path)

    def clean_file(self, input_path: str, output_path: str):
        """Clean a single markdown file."""
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Pre-clean content
        content = self.pre_clean_content(content)

        # Apply cleaning rules
        cleaned_content = self.clean_content(content)

        # Write cleaned content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

    def pre_clean_content(self, content: str) -> str:
        """Pre-clean the content to remove formatting artifacts."""
        # Remove navigation elements and other unwanted content
        content = re.sub(r'Need help\?.*?Changelog', '', content, flags=re.DOTALL)
        content = re.sub(r'Parameters:\s*-\s*$', '', content, flags=re.MULTILINE)

        # Remove bold markers and other formatting artifacts
        content = re.sub(r'\*\*(?:Category|Description|Examples?):\*\*', '', content)
        content = re.sub(r'\*\*.*?\*\*', '', content)

        # Remove duplicate sections
        content = re.sub(r'(?:^|\n)([^#\n]+?)(?:\s+\1)+', r'\n\1', content)
        content = re.sub(r'(?:Category:?\s*[^\n]+\s*){2,}', lambda m: m.group(0).split('\n')[0], content)

        # Fix spacing issues
        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r'\.+', '.', content)  # Remove multiple periods

        # Fix code block markers
        content = re.sub(r'```\s*(typescript|javascript)?\s*\n', r'```\1\n', content)

        return content

    def clean_content(self, content: str) -> str:
        """Apply cleaning rules to the content."""
        # Split content into sections
        sections = self.split_into_sections(content)

        # Clean each section
        cleaned_sections = {}
        for section_name, section_content in sections.items():
            cleaned_sections[section_name] = self.clean_section(section_name, section_content)

        # Reconstruct the document
        return self.reconstruct_document(cleaned_sections)

    def split_into_sections(self, content: str) -> Dict[str, str]:
        """Split the content into logical sections."""
        sections = {}

        # Extract title
        title_match = re.search(r'^#\s*(.+?)(?:\n|$)', content)
        if title_match:
            sections['title'] = title_match.group(1).strip()

        # Extract category
        category_match = re.search(r'Category:?\s*([^#\n]+)', content)
        if category_match:
            sections['category'] = category_match.group(1).strip()

        # Extract description
        description_match = re.search(r'(?:Description:)?\s*([^#```].*?)(?=\n#|\n```|\Z)', content, re.DOTALL)
        if description_match:
            desc = description_match.group(1).strip()
            # Clean up description
            desc = re.sub(r'\s+', ' ', desc)
            desc = re.sub(r'Category:.*?(?=\w)', '', desc)  # Remove any category text from description
            desc = re.sub(r'(?:^|\s)([a-z])', lambda m: m.group(1).upper(), desc)  # Capitalize first letter
            sections['description'] = desc.strip()

        # Extract code examples
        code_blocks = []
        for match in re.finditer(r'```(?:typescript|javascript)?\n(.*?)\n```', content, re.DOTALL):
            code = match.group(1).strip()
            if code:  # Only add non-empty code blocks
                code_blocks.append(code)
        if code_blocks:
            sections['examples'] = code_blocks

        # Extract parameters
        params_match = re.search(r'(?:Parameters|Args|Arguments):(.*?)(?=\n#|\n(?:Returns?|Output):|\Z)', content, re.DOTALL)
        if params_match:
            sections['parameters'] = params_match.group(1).strip()

        # Extract return value
        returns_match = re.search(r'(?:Returns?|Output):(.*?)(?=\n#|\Z)', content, re.DOTALL)
        if returns_match:
            sections['returns'] = returns_match.group(1).strip()

        return sections

    def clean_section(self, section_name: str, content: str) -> str:
        """Clean a specific section based on its type."""
        if not content:
            return ""

        if section_name == 'examples':
            return self.clean_code_examples(content)
        elif section_name == 'parameters':
            return self.clean_parameters(content)
        elif section_name == 'returns':
            return self.clean_returns(content)
        else:
            return self.clean_text(content)

    def clean_code_examples(self, examples: List[str]) -> str:
        """Clean and format code examples."""
        cleaned_examples = []
        for example in examples:
            # Remove extra whitespace
            lines = [line.rstrip() for line in example.split('\n')]
            # Remove empty lines at start and end
            while lines and not lines[0].strip():
                lines.pop(0)
            while lines and not lines[-1].strip():
                lines.pop()

            # Add proper spacing around operators and after commas
            lines = [self.format_code_line(line) for line in lines]

            # Join lines and ensure proper spacing
            cleaned_example = '\n'.join(lines)
            cleaned_example = re.sub(r'{\s*\n\s*}', '{ }', cleaned_example)  # Fix empty objects
            cleaned_examples.append(cleaned_example)

        return '\n\n'.join(f'```typescript\n{example}\n```' for example in cleaned_examples)

    def format_code_line(self, line: str) -> str:
        """Apply a fixed sequence of regex clean-ups to one line of example code.

        Each substitution operates on the output of the previous one, so the
        order of the steps matters: several later steps repair side effects
        of earlier ones (for example the path/URL fixes after the slash and
        colon spacing rules). The result has all whitespace runs collapsed
        to a single space and is stripped at both ends, so any leading
        indentation is removed. When the result differs from the input, the
        before/after pair is logged at debug level.

        Args:
            line: One line of code, without its line terminator.

        Returns:
            The reformatted line.
        """
        original_line = line

        # Comments.
        # Every '/' directly followed by a character that is neither '/' nor
        # whitespace becomes '// ' plus that character. This is not limited
        # to comments: slashes in paths and URLs are rewritten as well.
        line = re.sub(r'/([^/\s])', r'// \1', line)
        line = re.sub(r'//\s*the\s+', r'// The ', line)  # '//the ' / '// the ' -> '// The '

        # Import statements: re-emit `import {a,b} from "x"` with the names
        # stripped and joined by ', ' inside '{ ... }'.
        line = re.sub(r'import\s*{([^}]+)}\s*from\s*"([^"]+)"', lambda m:
            f'import {{ {", ".join(p.strip() for p in m.group(1).split(","))} }} from "{m.group(2)}"', line)

        # Common spacing issues.
        line = re.sub(r'(\w+):', r'\1: ', line)  # Append a space after any colon that follows a word character
        line = re.sub(r',(\S)', r', \1', line)  # Add a space after a comma that is directly followed by text
        line = re.sub(r'(\S)([=+\-*/])(\S)', r'\1 \2 \3', line)  # Space out = + - * / when squeezed between two non-space characters

        # URLs and paths: undo whitespace that ended up around slashes.
        # The first group is greedy, so at most one slash (the last one that
        # still has text after it) is matched per call.
        line = re.sub(r'from\s*"([^"]+)\s*/\s*([^"]+)"', r'from "\1/\2"', line)
        line = re.sub(r'"https:\s*/\s*/', '"https://', line)  # Rejoin a split '"https://' prefix
        line = re.sub(r'thirdweb\s*/\s*([^"\s]+)', r'thirdweb/\1', line)  # Rejoin 'thirdweb / x' as 'thirdweb/x'

        # Spacing around braces, parentheses and square brackets.
        line = re.sub(r'\s*([{(\[])(\s*)', r' \1', line)  # Exactly one space before an opening bracket, none after it
        line = re.sub(r'(\s*)([})\]])', r'\2', line)  # No whitespace before a closing bracket
        line = re.sub(r'{\s*}', '{ }', line)  # Empty braces are written as '{ }'

        # Bigint literals and hex values.
        # NOTE(review): the replacement equals the matched text, so this
        # substitution currently changes nothing.
        line = re.sub(r'(\d+)n\b', r'\1n', line)
        line = re.sub(r'0x-', '0x', line)  # Drop a '-' directly after '0x'
        line = re.sub(r'0x\.+', '0x...', line)  # Any run of dots after '0x' becomes exactly three

        # Function calls and variable declarations.
        line = re.sub(r'(\w+)\s*\(\s*{', r'\1({', line)  # 'name ( {' -> 'name({'
        line = re.sub(r'await\s+(\w+)', r'await \1', line)  # Single space after 'await'
        line = re.sub(r'const\s+(\w+)\s*=\s*await', r'const \1 = await', line)
        line = re.sub(r'const\s+(\w+)\s*=', r'const \1 = ', line)  # 'const x=' -> 'const x = '

        # TypeScript type annotations.
        line = re.sub(r':\s*([A-Z]\w+)', r': \1', line)  # Exactly one space between ':' and a capitalised identifier
        # Removes whitespace right after '<'. Whitespace before '>' is
        # captured by the greedy group and therefore kept.
        line = re.sub(r'<\s*([^>]+)\s*>', r'<\1>', line)

        # Collapse every whitespace run to one space and trim both ends.
        line = re.sub(r'\s+', ' ', line)
        line = line.strip()

        # Log any line that was changed, for debugging
        if line != original_line:
            logger.debug(f"Cleaned line:\nFrom: {original_line}\nTo:   {line}")

        return line

    def clean_parameters(self, params: str) -> str:
        """Clean and format parameters section."""
        if not params.strip():
            return ""

        lines = params.split('\n')
        cleaned_lines = []
        current_param = None

        for line in lines:
            line = line.strip()
            if line:
                # Format parameter definitions
                param_match = re.match(r'^[-*]?\s*`?(\w+)`?\s*(?:\((.*?)\))?\s*(?::|-)?\s*(.*)$', line)
                if param_match:
                    name, type_info, description = param_match.groups()
                    # Clean up the description
                    description = re.sub(r'\s+', ' ', description).strip()
                    description = description.capitalize()
                    type_str = f" (`{type_info}`)" if type_info else ""
                    cleaned_lines.append(f"- `{name}`{type_str}: {description}")
                    current_param = name
                else:
                    # Try to extract parameter info from other formats
                    alt_match = re.match(r'^[-*]?\s*(\w+)\s*[-:]\s*(.+)$', line)
                    if alt_match:
                        name, description = alt_match.groups()
                        description = re.sub(r'\s+', ' ', description).strip()
                        description = description.capitalize()
                        cleaned_lines.append(f"- `{name}`: {description}")
                        current_param = name
                    elif current_param:  # This line is a continuation of the previous parameter
                        # Add the line to the previous parameter's description
                        cleaned_lines[-1] = cleaned_lines[-1] + " " + line

        if cleaned_lines:
            return "\n\n### Parameters\n\n" + '\n'.join(cleaned_lines)
        return ""

    def clean_returns(self, returns: str) -> str:
        """Clean and format returns section."""
        if not returns.strip():
            return ""

        cleaned_returns = returns.strip()
        # Format return type if present
        return_match = re.match(r'^`?(\w+)`?\s*(?:\((.*?)\))?\s*(?::|-)?\s*(.*)$', cleaned_returns)
        if return_match:
            type_name, type_info, description = return_match.groups()
            type_str = f"`{type_info or type_name}`"
            cleaned_returns = f"{type_str}: {description}"

        return "\n\n### Returns\n\n" + cleaned_returns

    def clean_text(self, text: str) -> str:
        """Clean and format regular text."""
        if not text:
            return ""

        # Remove duplicate text
        text = re.sub(r'(\b\w+\b)(\s+\1)+', r'\1', text)

        # Remove navigation elements
        text = re.sub(r'Need help\?.*?Changelog', '', text, flags=re.DOTALL)

        # Fix spacing
        text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
        text = text.strip()

        # Fix sentence spacing and punctuation
        text = re.sub(r'(?<=[.!?])\s*(?=[A-Z])', '.  ', text)  # Double space after sentences
        text = re.sub(r'(?<=\w),(?=\w)', ', ', text)  # Add space after commas
        text = re.sub(r'(?<=\w)\.(?=\w)', '. ', text)  # Add space after periods
        text = re.sub(r'\s+([.,:;])', r'\1', text)  # Remove space before punctuation
        text = re.sub(r'\.+', '.', text)  # Remove multiple periods

        # Capitalize sentences and fix spacing
        text = re.sub(r'(?:^|(?<=[.!?])\s+)([a-z])', lambda m: m.group(1).upper(), text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def reconstruct_document(self, sections: Dict[str, str]) -> str:
        """Reconstruct the document from cleaned sections."""
        doc_parts = []

        # Title
        if 'title' in sections:
            doc_parts.append(f"# {sections['title']}\n")

        # Category
        if 'category' in sections:
            doc_parts.append(f"Category: {sections['category']}\n")

        # Description
        if 'description' in sections:
            doc_parts.append(f"\n{sections['description']}\n")

        # Examples
        if 'examples' in sections:
            doc_parts.append("\n### Examples\n")
            doc_parts.append(sections['examples'])

        # Parameters
        if 'parameters' in sections:
            doc_parts.append(sections['parameters'])

        # Returns
        if 'returns' in sections:
            doc_parts.append(sections['returns'])

        return '\n'.join(doc_parts)

def main():
    """Command-line entry point: parse the arguments and clean the directory."""
    cli = argparse.ArgumentParser(description='Clean and standardize markdown documentation files.')
    cli.add_argument('input_dir', help='Input directory containing markdown files')
    cli.add_argument('--output-dir', help='Output directory for cleaned files (default: same as input)')
    options = cli.parse_args()

    MarkdownCleaner(options.input_dir, options.output_dir).clean_directory()

if __name__ == '__main__':
    main()