-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_markdown.py
More file actions
180 lines (136 loc) · 5.21 KB
/
split_markdown.py
File metadata and controls
180 lines (136 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
"""
Markdown Document Splitter
This script splits a Markdown file into multiple sub-documents based on
top-level headings (lines starting with a single '#').
"""
import argparse
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple
def parse_markdown(content: str) -> List[Tuple[str, str]]:
    """
    Parse markdown content and split it by top-level headings.

    A top-level heading is a line that starts with a single '#' followed by
    whitespace. Lines inside fenced code blocks (delimited by three or more
    backticks or tildes) are never treated as headings, so a shell or Python
    comment such as "# install deps" inside a code sample does not start a
    new section.

    Content that appears before the first top-level heading (preamble) is
    discarded.

    Args:
        content: The markdown file content

    Returns:
        List of tuples containing (heading_text, section_content). Each
        section_content starts with its own heading line.
    """
    # Compile once, outside the per-line loop.
    heading_re = re.compile(r'^#\s+')
    # A fence is up to three leading spaces, then a run of 3+ '`' or '~'.
    fence_re = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    sections: List[Tuple[str, str]] = []
    current_heading = None
    current_content: List[str] = []
    # Marker (e.g. '```') of the code fence we are currently inside, if any.
    open_fence = None

    for line in content.split('\n'):
        fence = fence_re.match(line)
        if open_fence is not None:
            # Inside a code block: only a fence of the same character, at
            # least as long as the opener and with no trailing text, closes it.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            ):
                open_fence = None
        elif fence:
            open_fence = fence.group(1)
        else:
            heading = heading_re.match(line)
            if heading:
                # Flush the section that just ended, then start a new one.
                if current_heading is not None:
                    sections.append((current_heading, '\n'.join(current_content)))
                current_heading = line[heading.end():].strip()
                current_content = [line]  # Keep the heading line in the content
                continue

        # Ordinary line (or fence / fenced line): belongs to the open section.
        # Lines before the first heading are preamble and are skipped.
        if current_heading is not None:
            current_content.append(line)

    # Don't forget the last section
    if current_heading is not None:
        sections.append((current_heading, '\n'.join(current_content)))
    return sections
def sanitize_filename(text: str) -> str:
    """
    Convert heading text to a safe filename.

    Removes characters that are invalid in filenames on common filesystems
    (< > : " / \\ | ? *) as well as ASCII control characters such as tab and
    NUL. Chinese characters, letters, numbers, spaces, and other punctuation
    are kept. The result is limited to 100 characters and never has leading
    or trailing whitespace.

    Args:
        text: The heading text

    Returns:
        Sanitized filename (may be empty if nothing valid remains)
    """
    # Drop reserved punctuation and control characters (0x00-0x1F, 0x7F);
    # an embedded NUL would otherwise make the later file write fail.
    text = re.sub(r'[<>:"/\\|?*\x00-\x1f\x7f]', '', text)
    # Limit length to avoid filesystem issues. Strip again afterwards because
    # the cut can land right after a space and leave trailing whitespace.
    return text.strip()[:100].strip()
def split_markdown_file(
    input_file: Path,
    output_dir: Optional[Path] = None,
) -> None:
    """
    Split a markdown file into multiple files based on top-level headings.

    Each section is written to "<index>_<sanitized heading>.md" inside the
    output directory. The index is zero-padded to at least two digits (more
    when there are 100+ sections) so the files sort in document order.
    Progress is reported on stdout. If the document has no top-level
    headings, a warning is printed and nothing is written or created.

    Args:
        input_file: Path to the input markdown file
        output_dir: Optional output directory (default: a folder next to the
            input file, named after the input file without its extension)

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input path is not a regular file or is not a
            markdown file (.md / .markdown)
    """
    if not input_file.exists():
        raise FileNotFoundError(f"Input file not found: {input_file}")
    # A directory named e.g. "notes.md" would otherwise pass the suffix check
    # and fail later with an opaque OS error.
    if not input_file.is_file():
        raise ValueError(f"Input path is not a file: {input_file}")
    if input_file.suffix.lower() not in ('.md', '.markdown'):
        raise ValueError(f"Input file must be a markdown file: {input_file}")

    # Determine output directory
    if output_dir is None:
        # Folder with same name as input file (without extension)
        output_dir = input_file.parent / input_file.stem

    print(f"Reading: {input_file}")
    content = input_file.read_text(encoding='utf-8')

    # Parse and split
    sections = parse_markdown(content)
    if not sections:
        print("Warning: No top-level headings found in the document")
        return

    # Create the directory only once we know there is something to write,
    # so a heading-less document does not leave an empty folder behind.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Found {len(sections)} sections")
    print(f"Output directory: {output_dir}")

    # Pad the index wide enough for the largest one to keep lexical order.
    width = max(2, len(str(len(sections))))

    # Write each section to a separate file
    for idx, (heading, section_content) in enumerate(sections, 1):
        filename = sanitize_filename(heading)
        output_file = output_dir / f"{idx:0{width}d}_{filename}.md"
        output_file.write_text(section_content, encoding='utf-8')
        print(f" Created: {output_file.name}")

    print(f"\n✓ Successfully split into {len(sections)} files")
def main() -> int:
    """
    Command-line entry point.

    Parses the input file path and optional output directory from the command
    line, runs the split, and reports failures on stderr.

    Returns:
        Process exit status: 0 on success, 1 on any error, 130 when the
        user interrupts the run.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Split Markdown file by top-level headings",
    )
    cli.add_argument(
        "input_file",
        help="Path to the markdown file to split",
        type=Path,
    )
    cli.add_argument(
        "-o",
        "--output",
        help="Output directory (default: folder named after input file)",
        type=Path,
    )
    options = cli.parse_args()

    # Assume success; each handler below overrides the status it maps to.
    exit_code = 0
    try:
        split_markdown_file(input_file=options.input_file, output_dir=options.output)
    except (FileNotFoundError, ValueError) as err:
        # Expected, user-facing validation failures.
        print(f"Error: {err}", file=sys.stderr)
        exit_code = 1
    except KeyboardInterrupt:
        # 130 is the conventional status for termination by Ctrl-C.
        print("\nOperation cancelled by user", file=sys.stderr)
        exit_code = 130
    except Exception as err:
        # Last-resort boundary so the CLI never dumps a traceback.
        print(f"Unexpected error: {err}", file=sys.stderr)
        exit_code = 1
    return exit_code
# Run the CLI only when executed as a script, and hand main()'s return
# value to the interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())