|
| 1 | +const fs = require('fs'); |
| 2 | +const path = require('path'); |
| 3 | + |
/**
 * Safely read a UTF-8 text file, refusing paths that could escape the current directory.
 *
 * A path is rejected (and null returned) when it:
 *  - is not a string,
 *  - is absolute under either POSIX or Windows rules, regardless of host platform, or
 *  - still contains a `..` path segment after normalization.
 *
 * NOTE: symbolic links are not resolved here, so a symlink inside the working
 * directory can still point outside of it.
 *
 * @param {string} filePath - Relative path of the file to read
 * @returns {string|null} File content, or null if the path is rejected or the read fails
 */
function readFile(filePath) {
  // path.normalize() throws on non-strings; honour the "null on invalid path" contract instead.
  if (typeof filePath !== 'string') {
    console.log(`Invalid path: ${String(filePath)}`);
    return null;
  }

  const normalizedPath = path.normalize(filePath);

  // Check both flavours so that "C:\\..." is rejected on POSIX hosts and "/..." on Windows hosts.
  const isAbsolute =
    path.posix.isAbsolute(normalizedPath) || path.win32.isAbsolute(normalizedPath);

  // Compare whole segments: a name such as "notes..draft.md" is not a traversal.
  const hasParentSegment = normalizedPath.split(/[\\/]/).includes('..');

  if (isAbsolute || hasParentSegment) {
    console.log(`Invalid path: ${filePath}`);
    return null;
  }

  try {
    return fs.readFileSync(normalizedPath, 'utf8');
  } catch (error) {
    console.log(`Error reading file ${filePath}: ${error.message}`);
    return null;
  }
}
| 25 | + |
/**
 * Extract internal markdown links from content.
 *
 * Matches inline links such as [text](./file.md), [text](../file.md),
 * [text](file.md) and [text](file.md#section). A link is treated as internal
 * when its target has no URL scheme (http:, https:, ftp:, mailto:, ...), is not
 * protocol-relative (//host/...), and its file part ends with ".md".
 *
 * @param {string} content - The markdown content to scan for links
 * @param {string} basePath - Base directory path for resolving relative links
 * @returns {Array} Array of link objects: `text` (link label), `path` (target
 *   exactly as written, including any #fragment) and `resolvedPath` (basePath
 *   joined with the target's file part, fragment removed)
 */
function extractInternalLinks(content, basePath) {
  const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
  // URI scheme prefix ("scheme:") or protocol-relative URL ("//host").
  // Deliberately not /g, so .test() carries no state between calls.
  const externalPattern = /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i;
  const internalLinks = [];

  for (const [, linkText, linkPath] of String(content).matchAll(linkPattern)) {
    if (externalPattern.test(linkPath)) {
      continue;
    }

    // Drop a trailing "#fragment" so "guide.md#intro" resolves to guide.md.
    const fragmentStart = linkPath.indexOf('#');
    const filePart = fragmentStart === -1 ? linkPath : linkPath.slice(0, fragmentStart);

    if (!filePart.endsWith('.md')) {
      continue;
    }

    internalLinks.push({
      text: linkText,
      path: linkPath,
      resolvedPath: path.join(basePath, filePart)
    });
  }

  return internalLinks;
}
| 55 | + |
/**
 * Read a markdown file and recursively follow its internal links.
 *
 * Each result node has the shape `{ path, content, error, linkedFiles }`.
 * Nodes read at the depth limit additionally carry `depthLimitReached: true`
 * and never have children. Entries in `linkedFiles` also carry `linkText` and
 * `originalPath` describing the link that led to them.
 *
 * @param {string} filePath - Path to the markdown file
 * @param {Object} options - Configuration options
 * @param {boolean} options.followLinks - Whether to follow internal links (default: true)
 * @param {number} options.maxDepth - Maximum depth to follow links (default: 3)
 * @param {Set} options.visited - Internal set of normalized paths already on the
 *   current link chain (prevents cycles); this call adds its own path to it
 * @param {number} options.currentDepth - Current depth (internal)
 * @returns {Object} Object containing content and linked files
 */
function readMarkdown(filePath, options = {}) {
  const {
    followLinks = true,
    maxDepth = 3,
    visited = new Set(),
    currentDepth = 0
  } = options;

  const READ_ERROR = 'File not found or could not be read';
  const normalizedPath = path.normalize(filePath);

  // Already on the current chain of links: stop instead of looping forever.
  if (visited.has(normalizedPath)) {
    return {
      path: normalizedPath,
      content: null,
      error: 'Circular reference detected',
      linkedFiles: []
    };
  }

  // At the depth limit the file itself is still read, but its links are not followed.
  if (currentDepth >= maxDepth) {
    const leafContent = readFile(normalizedPath);
    return {
      path: normalizedPath,
      content: leafContent,
      // Report unreadable files here the same way the normal path below does.
      error: leafContent === null ? READ_ERROR : null,
      linkedFiles: [],
      depthLimitReached: true
    };
  }

  visited.add(normalizedPath);

  const content = readFile(normalizedPath);
  if (content === null) {
    return {
      path: normalizedPath,
      content: null,
      error: READ_ERROR,
      linkedFiles: []
    };
  }

  const result = {
    path: normalizedPath,
    content: content,
    error: null,
    linkedFiles: []
  };

  if (followLinks) {
    const basePath = path.dirname(normalizedPath);

    for (const link of extractInternalLinks(content, basePath)) {
      const linkedFileResult = readMarkdown(link.resolvedPath, {
        followLinks,
        maxDepth,
        // Copy per branch: only ancestors count as cycles, so sibling links
        // may legitimately reference the same file.
        visited: new Set(visited),
        currentDepth: currentDepth + 1
      });

      result.linkedFiles.push({
        linkText: link.text,
        originalPath: link.path,
        ...linkedFileResult
      });
    }
  }

  return result;
}
| 141 | + |
/**
 * @module readMarkdownWithLinks
 * @description Reads a markdown file and follows internal links to create a comprehensive document view.
 * Prevents circular references and supports configurable depth limits.
 * In the combined text, every file that has non-empty content is emitted as a
 * "=== <basename> ===" header (indented one space per nesting level) followed by
 * its content and a blank line; files without content contribute nothing themselves.
 * @param {string} filePath - Path to the markdown file to read
 * @param {Object} [options={}] - Configuration options for link following (null is treated as {})
 * @param {boolean} [options.followLinks=true] - Whether to follow internal links
 * @param {number} [options.maxDepth=3] - Maximum depth to follow links
 * @param {boolean} [options.structured=false] - Return structured data instead of combined text
 * @returns {string|Object} Combined content of the file and all linked files with headers,
 * or the structured result tree from readMarkdown when options.structured is true
 * @example {{ "docs/README.md" | readMarkdownWithLinks }}
 * @example {{ "docs/README.md" | readMarkdownWithLinks(maxDepth=2) }}
 * @license MIT
 */
function readMarkdownWithLinks(filePath, options = {}) {
  // A default parameter only covers `undefined`; fall back for an explicit null as well.
  const {
    followLinks = true,
    maxDepth = 3,
    structured = false
  } = options ?? {};

  const result = readMarkdown(filePath, {
    followLinks,
    maxDepth,
    visited: new Set(),
    currentDepth: 0
  });

  if (structured) {
    return result;
  }

  // Depth-first flattening of the result tree into one string.
  const combineContent = (fileResult, depth = 0) => {
    const parts = [];

    if (fileResult.content) {
      const indent = ' '.repeat(depth);
      parts.push(`${indent}=== ${path.basename(fileResult.path)} ===\n`);
      parts.push(`${fileResult.content}\n\n`);
    }

    if (fileResult.linkedFiles) {
      for (const linkedFile of fileResult.linkedFiles) {
        parts.push(combineContent(linkedFile, depth + 1));
      }
    }

    return parts.join('');
  };

  return combineContent(result);
}
| 196 | + |
module.exports = readMarkdownWithLinks;

// ============================================================================
// TESTS (for local development only)
// ============================================================================
// Runs only when this file is executed directly, never when it is require()d.
// Fixtures are created under ./test-files relative to the current working
// directory (readFile only accepts relative paths) and are removed afterwards,
// whether the checks pass or fail.
if (require.main === module) {
  // Logs a passing check; throws on failure so the `finally` cleanup still runs.
  const assert = (condition, message) => {
    if (!condition) {
      throw new Error(message);
    }
    console.log(`✅ ${message}`);
  };

  try {
    // Setup
    fs.mkdirSync('./test-files/sub', { recursive: true });
    fs.writeFileSync('./test-files/main.md', '# Main\n[Related](./related.md)\n[Another](./another.md)\n[External](https://example.com)');
    fs.writeFileSync('./test-files/related.md', '# Related\n[Sub](./sub/subdoc.md)');
    fs.writeFileSync('./test-files/another.md', '# Another');
    fs.writeFileSync('./test-files/sub/subdoc.md', '# Sub\n[Main](../main.md)');

    console.log('🧪 Running tests\n');

    // Test 1: Basic reading
    let r = readMarkdown('./test-files/main.md', { followLinks: false });
    assert(r.content?.includes('# Main'), 'Basic file reading');

    // Test 2: Link following
    r = readMarkdown('./test-files/main.md', { maxDepth: 2 });
    assert(r.linkedFiles.length === 2, 'Follows 2 links');
    assert(r.linkedFiles[0].linkedFiles.length === 1, 'Nested link following');

    // Test 3: Circular reference (main -> related -> sub/subdoc -> main)
    r = readMarkdown('./test-files/main.md', { maxDepth: 5 });
    const circularRef = r.linkedFiles[0].linkedFiles[0].linkedFiles[0];
    assert(circularRef?.error === 'Circular reference detected', 'Circular reference detection');

    // Test 4: Depth limit
    r = readMarkdown('./test-files/main.md', { maxDepth: 1 });
    assert(r.linkedFiles[0].linkedFiles.length === 0, 'Depth limit respected');

    // Test 5: Non-existent file
    r = readMarkdown('./test-files/missing.md');
    assert(r.error === 'File not found or could not be read', 'Non-existent file handling');

    // Test 6: Combined output
    const combined = readMarkdownWithLinks('./test-files/main.md', { maxDepth: 1 });
    assert(combined.includes('=== main.md ==='), 'Combined format includes headers');
    assert(combined.includes(' === related.md ==='), 'Nested files indented');

    // Test 7: Path traversal blocked
    r = readMarkdown('../../../etc/passwd');
    assert(r.content === null, 'Path traversal blocked');
    assert(r.error === 'File not found or could not be read', 'Path traversal returns error');

    // Test 8: Absolute path blocked
    const content1 = readFile('/etc/passwd');
    assert(content1 === null, 'Absolute Unix path blocked');

    const content2 = readFile('C:\\Windows\\System32\\config');
    assert(content2 === null, 'Absolute Windows path blocked');

    // Test 9: Empty file handling
    fs.writeFileSync('./test-files/empty.md', '');
    r = readMarkdown('./test-files/empty.md');
    assert(r.content === '', 'Empty file handled');
    assert(r.linkedFiles.length === 0, 'Empty file has no links');

    console.log('\n🎉 All tests passed!');
  } catch (error) {
    console.error(`❌ ${error.message}`);
    // Signal failure without killing the process before cleanup has run.
    process.exitCode = 1;
  } finally {
    fs.rmSync('./test-files', { recursive: true, force: true });
  }
}
0 commit comments