-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbasegen.py
More file actions
310 lines (287 loc) · 10.6 KB
/
basegen.py
File metadata and controls
310 lines (287 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/env python3
import argparse
import fnmatch
import pathlib
import sys
import os
import json
from typing import List, Optional
import pathspec
def load_config() -> dict:
    """
    Load configuration settings from ``config.json`` in the current working directory.

    Returns:
        The parsed JSON object when the file exists; otherwise the built-in
        defaults, which provide ``HARD_CODED_EXCLUDES`` (glob patterns that are
        always excluded) and ``LANGUAGE_MAPPING`` (file extension -> code-fence
        language).

    Exits:
        Prints a message to stderr and exits with status 1 when the file exists
        but cannot be read, is not valid JSON, or its top-level value is not a
        JSON object (callers use ``.get`` on the result).
    """
    config_path = os.path.join(os.getcwd(), "config.json")
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # No config file: fall through to the defaults below.
        pass
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading config file '{config_path}': {e}", file=sys.stderr)
        sys.exit(1)
    else:
        if not isinstance(loaded, dict):
            print(
                f"Error reading config file '{config_path}': "
                f"top-level JSON value must be an object, got {type(loaded).__name__}",
                file=sys.stderr,
            )
            sys.exit(1)
        return loaded
    # Fallback default settings.
    return {
        "HARD_CODED_EXCLUDES": [
            "*.md",
            "*.txt",
            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.tiff", "*.svg",
            "*.mp3", "*.wav", "*.ogg", "*.flac",
            "package-lock.json",
        ],
        "LANGUAGE_MAPPING": {
            ".py": "python",
            ".rs": "rust",
            ".toml": "toml",
            ".json": "json",
            ".env": "bash",
            ".sh": "bash",
            ".md": "markdown",
            ".html": "html",
            ".css": "css",
            ".js": "javascript",
        },
    }
# Module-level configuration, loaded once at import time: the contents of
# config.json in the current working directory when that file exists, otherwise
# the built-in defaults returned by load_config(). Note that load_config() exits
# the process if config.json exists but cannot be read or parsed.
config_data = load_config()
def guess_language(ext: str) -> str:
    """
    Return the code-fence language name for a file extension.

    The extension is lower-cased before lookup. A non-empty
    config_data["LANGUAGE_MAPPING"] is used exclusively; when that entry is
    missing or empty, the built-in table below is consulted instead.
    Extensions with no entry yield an empty string.
    """
    key = ext.lower()
    configured = config_data.get("LANGUAGE_MAPPING", {})
    if not configured:
        builtin = {
            '.py': 'python',
            '.rs': 'rust',
            '.toml': 'toml',
            '.json': 'json',
            '.env': 'bash',
            '.sh': 'bash',
            '.md': 'markdown',
            '.html': 'html',
            '.css': 'css',
            '.js': 'javascript',
        }
        return builtin.get(key, '')
    return configured.get(key, '')
def _rebase_gitignore_pattern(line: str, prefix: str) -> str:
    """Rewrite one pattern from a nested .gitignore so it is relative to the scan root.

    ``prefix`` is the POSIX-style path of the directory holding the .gitignore,
    relative to the root. The rewrite follows the gitignore pattern rules:
    a leading ``!`` stays in front, a pattern with a slash at the start or in
    the middle is anchored to that directory, and any other pattern may match
    at every level below it. A trailing ``/`` (directory-only) is preserved.
    """
    negated = line.startswith("!")
    body = line[1:] if negated else line
    if not body.strip("/"):
        # Degenerate pattern ("!", "/", ...): nothing sensible to rebase.
        return line
    if "/" in body.rstrip("/"):
        rebased = f"{prefix}/{body.lstrip('/')}"
    else:
        rebased = f"{prefix}/**/{body}"
    return f"!{rebased}" if negated else rebased
def load_gitignore_specs(root: pathlib.Path) -> Optional[pathspec.PathSpec]:
    """
    Find all .gitignore files in the given root folder (recursively),
    adjust their patterns to be relative to the root,
    and compile a single PathSpec.

    Files are processed from shallowest to deepest so that rules from deeper
    .gitignore files come later in the combined list. Blank lines and lines
    starting with "#" are skipped. A .gitignore that cannot be read produces a
    warning on stderr and is skipped.

    Returns:
        The compiled PathSpec, or None (after printing an error to stderr) if
        scanning or pattern compilation fails.
    """
    patterns = []
    try:
        gitignores = sorted(
            root.rglob(".gitignore"),
            key=lambda p: (len(p.parts), p.as_posix()),
        )
        for gitignore in gitignores:
            try:
                lines = gitignore.read_text(encoding="utf-8").splitlines()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Warning: Could not read {gitignore}: {e}", file=sys.stderr)
                continue
            try:
                prefix = gitignore.parent.relative_to(root).as_posix()
            except ValueError:
                prefix = "."
            for line in lines:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if prefix == ".":
                    # Root-level .gitignore: patterns are already root-relative.
                    patterns.append(line)
                else:
                    patterns.append(_rebase_gitignore_pattern(line, prefix))
        # Always ignore any .gitignore file itself.
        patterns.append("**/.gitignore")
        return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
    except Exception as e:
        # Best-effort boundary: a failure here disables gitignore filtering
        # rather than aborting the run.
        print(f"Error loading .gitignore specifications: {e}", file=sys.stderr)
        return None
def should_include_file(
    file: pathlib.Path,
    root: pathlib.Path,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None,
    gitignore_spec: Optional[pathspec.PathSpec] = None,
) -> bool:
    """
    Decide whether a file should be included based on:
    1. Gitignore rules (if provided)
    2. CLI include/exclude glob patterns

    Patterns are matched with fnmatch against the file's path relative to
    *root* (POSIX-style separators). A pattern that contains no "/" is also
    matched against the bare file name, so a literal name such as
    "package-lock.json" applies at every depth, just as "*.md" already does.

    - If include_patterns is provided, a file is included only if it matches at least one.
    - If the file matches any exclude pattern or a gitignore rule, it is omitted.

    If *file* is not located under *root*, its path is used as given.
    """
    try:
        rel = file.relative_to(root)
    except ValueError:
        rel = file
    rel_str = rel.as_posix()
    name = rel.name

    def matches(pattern: str) -> bool:
        if fnmatch.fnmatch(rel_str, pattern):
            return True
        # Slash-less patterns name a file, not a location: try the basename too.
        return "/" not in pattern and fnmatch.fnmatch(name, pattern)

    if gitignore_spec and gitignore_spec.match_file(rel_str):
        return False
    if include_patterns and not any(matches(p) for p in include_patterns):
        return False
    if exclude_patterns and any(matches(p) for p in exclude_patterns):
        return False
    return True
def build_tree(paths: List[pathlib.Path]) -> dict:
    """
    Turn a list of relative file paths into a nested dictionary tree.
    Every path component becomes a key; a component with children maps to a
    dictionary of those children, and a final component maps to an empty dict.
    """
    root_node: dict = {}
    for entry in paths:
        node = root_node
        # Walk down one component at a time, creating levels on demand.
        for name in entry.parts:
            node = node.setdefault(name, {})
    return root_node
def format_tree(tree: dict, indent: str = "") -> List[str]:
    """
    Render the nested dictionary tree as a list of text lines, depth-first.
    Keys are emitted in sorted order; a key with children gets a " >" suffix
    and its children follow, indented one level deeper.
    """
    rendered: List[str] = []
    for name in sorted(tree):
        children = tree[name]
        if not children:
            # Leaf entry: no suffix, nothing to recurse into.
            rendered.append(f"{indent}{name}")
            continue
        rendered.append(f"{indent}{name} >")
        rendered += format_tree(children, indent + " ")
    return rendered
def _code_fence(content: str) -> str:
    """Return a backtick fence strictly longer than any backtick run in *content*."""
    longest = current = 0
    for ch in content:
        current = current + 1 if ch == "`" else 0
        longest = max(longest, current)
    return "`" * max(3, longest + 1)
def generate_markdown(
    root_path: pathlib.Path,
    output_file: str,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None,
    gitignore_spec: Optional[pathspec.PathSpec] = None,
) -> None:
    """
    Generate a Markdown document containing:
    1. A directory tree (table of contents) showing only the files that match the filters.
    2. For each included file, a section with the file’s path and its contents inside a fenced code block.
    The include and exclude patterns are applied relative to the codebase root.

    Paths in the document are shown relative to the parent of *root_path*, so
    they start with the root directory's own name. The output file itself is
    never embedded, even when it lives inside *root_path*. Files that cannot be
    read as UTF-8 text get an "Error reading file: ..." placeholder as content.

    Exits with status 1 (after printing to stderr) if the directory cannot be
    scanned or the output file cannot be written.
    """
    # Resolve first so inputs like "." still yield a real directory name for
    # the title and the tree root.
    root_path = root_path.resolve()
    base = root_path.parent
    output_abs = pathlib.Path(output_file).resolve()
    included_files = []
    try:
        for file in sorted(root_path.rglob("*")):
            if file == output_abs:
                # Do not embed a previous run's output into the new one.
                continue
            if file.is_file() and should_include_file(file, root_path, include_patterns, exclude_patterns, gitignore_spec):
                try:
                    rel_file = file.relative_to(base)
                except ValueError:
                    rel_file = file
                included_files.append(rel_file)
    except OSError as e:
        print(f"Error scanning directory '{root_path}': {e}", file=sys.stderr)
        sys.exit(1)
    if not included_files:
        print("Warning: No files found matching the criteria.", file=sys.stderr)
    tree_str = "\n".join(format_tree(build_tree(included_files)))
    md_lines = [
        f"# Codebase: {root_path.name or root_path}",
        "",
        "## Directory Tree",
        "",
        "```",
        tree_str,
        "```",
        "",
        "## Files",
        "",
    ]
    for rel_path in included_files:
        file_path = base / rel_path
        language = guess_language(file_path.suffix)
        try:
            content = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            content = f"Error reading file: {e}"
        # A fixed ``` fence would be closed early by content that itself
        # contains backtick fences, so size the fence to the content.
        fence = _code_fence(content)
        md_lines.append(f"### {rel_path}")
        md_lines.append("")
        md_lines.append(f"{fence}{language}")
        md_lines.append(content)
        md_lines.append(fence)
        md_lines.append("")
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(md_lines))
    except OSError as e:
        print(f"Error writing to output file '{output_file}': {e}", file=sys.stderr)
        sys.exit(1)
    print(f"Markdown file generated: {output_file}")
def main():
    """
    Command-line entry point: parse arguments, assemble the gitignore spec and
    the exclude list (CLI excludes followed by the configured hard-coded ones),
    then generate the Markdown document.
    """
    description = (
        "Generate a Markdown document documenting a codebase. "
        "The document includes a directory tree (as a table of contents) and file contents with syntax highlighting. "
        "Optionally, you can provide include/exclude glob patterns (relative to the codebase root) to filter files. "
        "Files matching any .gitignore rules or hardcoded exclusions (from config.json) will be omitted. "
        "Use --no-gitignore to disable applying .gitignore rules."
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input", help="Path to the codebase directory")
    parser.add_argument(
        "-o",
        "--output",
        default="codebase.md",
        help="Output Markdown file (default: codebase.md)",
    )
    parser.add_argument(
        "--include",
        nargs="+",
        help="Glob pattern(s) for files to include (relative to the codebase root). Only files matching at least one pattern will be included.",
    )
    parser.add_argument(
        "--exclude",
        nargs="+",
        help="Glob pattern(s) for files to exclude (relative to the codebase root). Files matching any of these patterns will be omitted.",
    )
    parser.add_argument(
        "--no-gitignore",
        action="store_true",
        help="Disable applying .gitignore file exclusions.",
    )
    args = parser.parse_args()

    root = pathlib.Path(args.input)
    if not (root.exists() and root.is_dir()):
        parser.error(f"The input path '{args.input}' is not a valid directory.")

    gitignore_spec = None if args.no_gitignore else load_gitignore_specs(root)

    # CLI excludes come first, followed by the configured hard-coded ones.
    combined_excludes = (args.exclude or []) + config_data.get("HARD_CODED_EXCLUDES", [])

    try:
        generate_markdown(
            root_path=root,
            output_file=args.output,
            include_patterns=args.include,
            exclude_patterns=combined_excludes,
            gitignore_spec=gitignore_spec,
        )
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point. Any exception that escapes main() is reported on
    # stderr and turned into exit status 1; SystemExit is not an Exception
    # subclass, so explicit exits pass through untouched.
    try:
        main()
    except Exception as err:
        print(f"Fatal error: {err}", file=sys.stderr)
        raise SystemExit(1)