-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathall-space-content.js
More file actions
executable file
·134 lines (116 loc) · 4.03 KB
/
all-space-content.js
File metadata and controls
executable file
·134 lines (116 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env node
/* eslint-disable no-console */
/* eslint-disable no-undef */
import fs from "fs";
import path from "path";
import { pathToFileURL } from "url";
import {
  convertToMarkdown,
  fetchWithBackoff,
  OUTPUT_DIR,
  sanitizeName,
  ensureDirectoryExists,
  getUniqueDirectoryName,
  formatApiUrl,
} from "../utils/index.js";
/**
* @typedef {import('./all-spaces.js').ConfluencePage} ConfluencePage
*/
/**
 * Fetches every page of a single Confluence space, converts each page's
 * storage-format body to Markdown and saves it to disk.
 *
 * @param {number} [limit=Infinity] - Maximum number of paginated API requests
 *   to make while listing the space's pages (forwarded to getSpaceContent).
 * @param {string} [spaceKey=process.argv[2]] - The space key to process
 *   (e.g., "ENGINEERING"). Defaults to the first command-line argument so
 *   existing callers that pass only `limit` keep working.
 * @returns {Promise<void>}
 * @throws {Error} If no space key is available; errors raised while
 *   fetching, converting or saving pages propagate to the caller.
 * @example
 * await scrapeConfluenceSpace(Infinity, "ENGINEERING");
 */
export async function scrapeConfluenceSpace(
  limit = Infinity,
  spaceKey = process.argv[2],
) {
  if (!spaceKey) {
    throw new Error("Please provide a space key as an argument");
  }

  console.log(`Fetching content for space: ${spaceKey}`);
  const pages = await getSpaceContent(spaceKey, limit);
  console.log(`Found ${pages.length} pages`);

  for (const page of pages) {
    const bodyStorage = page.body?.storage?.value;
    // Pages with a missing or empty storage body are skipped: nothing to write.
    if (!bodyStorage) {
      continue;
    }
    const markdownContent = convertToMarkdown(bodyStorage);
    await saveToMarkdown(spaceKey, page, markdownContent);
  }

  console.log("✅ All content has been scraped and saved!");
}
/**
 * Collects the pages of one Confluence space by following the paginated
 * content listing until no `next` link is returned or the request budget
 * is used up.
 *
 * @param {string} spaceKey - The space key whose content is listed
 * @param {number} [limit=Infinity] - Maximum number of API requests to issue
 * @returns {Promise<ConfluencePage[]>} Every result accumulated across the fetched pages
 * @throws {Error} Whatever fetchWithBackoff rejects with is propagated
 */
async function getSpaceContent(spaceKey, limit = Infinity) {
  const expand = "body.storage,ancestors,space,history";
  const collected = [];
  let nextUrl = formatApiUrl(`/space/${spaceKey}/content`, {
    expand,
    limit: "100",
  });

  for (let requestsMade = 0; nextUrl; requestsMade += 1) {
    if (requestsMade >= limit) {
      console.log(`Request limit (${limit}) reached. Stopping...`);
      break;
    }

    const response = await fetchWithBackoff(nextUrl);
    collected.push(...response.results);

    // The `next` link is re-formatted with the same `expand` list on every
    // follow-up request, exactly as on the first one.
    const nextLink = response._links?.next || null;
    nextUrl = nextLink ? formatApiUrl(nextLink, { expand }) : null;
  }

  return collected;
}
/**
 * Saves a page's content as a Markdown file under `OUTPUT_DIR/<spaceKey>`.
 *
 * Placement rules:
 * - A page with no ancestors is written to `<space>/home/`. It is named
 *   `0_<spaceKey>.md` when its id matches `page.space.homePage.id`, and
 *   `<sanitized title>.md` otherwise (including when the space carries no
 *   home-page information at all).
 * - A page with ancestors is written to the directory returned by
 *   getUniqueDirectoryName for the sanitized title of the last entry in
 *   `page.ancestors`, which this code treats as the parent.
 *
 * @param {string} spaceKey - The space key (used for directory structure)
 * @param {ConfluencePage} page - The page object containing content and metadata
 * @param {string} content - The markdown content to save
 * @returns {Promise<void>}
 * @throws {Error} If file operations fail
 */
async function saveToMarkdown(spaceKey, page, content) {
  const spacePath = path.join(OUTPUT_DIR, spaceKey);
  ensureDirectoryExists(spacePath);

  // The home directory for root/orphaned pages is created on every call,
  // even when the current page ends up in a parent directory.
  const homePath = path.join(spacePath, "home");
  ensureDirectoryExists(homePath);

  const sanitizedTitle = sanitizeName(page.title);
  const ancestors = page.ancestors ?? [];
  let targetDir;
  let fileName;

  if (ancestors.length === 0) {
    // Root level page - goes in home directory.
    targetDir = homePath;
    // `space` / `homePage` may be missing from the API response; optional
    // chaining makes such pages fall back to the title-based name instead
    // of throwing a TypeError.
    const homePageId = page.space?.homePage?.id;
    const isSpaceHomePage = homePageId != null && page.id === homePageId;
    fileName = isSpaceHomePage
      ? `0_${spaceKey}.md` // Space homepage
      : `${sanitizedTitle}.md`; // Other root pages
  } else {
    // Nested page - stored in a directory named after the last ancestor.
    const parentPage = ancestors[ancestors.length - 1];
    targetDir = getUniqueDirectoryName(
      spacePath,
      sanitizeName(parentPage.title),
    );
    ensureDirectoryExists(targetDir);
    fileName = page.id === parentPage.id ? "index.md" : `${sanitizedTitle}.md`;
  }

  const filePath = path.join(targetDir, fileName);
  fs.writeFileSync(filePath, content, "utf8");
  console.log(`✅ Saved: ${filePath}`);
}
// Parse command line arguments: an optional `-l <n>` / `--limit <n>` pair
// caps the number of API requests; without it the limit is Infinity.
const args = process.argv.slice(2);
const limitIndex = args.findIndex((arg) => arg === "-l" || arg === "--limit");
const limit =
  limitIndex !== -1 ? Number.parseInt(args[limitIndex + 1], 10) : Infinity;

/**
 * Reports whether this module is the script Node was asked to execute.
 *
 * `import.meta.url` is a `file:` URL while `process.argv[1]` is a filesystem
 * path, so the path has to be converted to a URL before the two can be
 * compared. The path is resolved through symlinks first when possible.
 *
 * @returns {boolean} True when the module is run directly, false when it is imported
 */
function isMainModule() {
  const entryPath = process.argv[1];
  if (!entryPath) {
    return false;
  }
  let resolvedEntry = entryPath;
  try {
    resolvedEntry = fs.realpathSync(entryPath);
  } catch {
    // Keep the path exactly as given if it cannot be resolved.
  }
  return import.meta.url === pathToFileURL(resolvedEntry).href;
}

if (isMainModule()) {
  if (Number.isNaN(limit)) {
    // A NaN limit would never satisfy `requestCount >= limit`, silently
    // turning a mistyped flag into "no limit" - reject it instead.
    console.error(`Error: ${args[limitIndex]} expects an integer value`);
    process.exitCode = 1;
  } else {
    scrapeConfluenceSpace(limit).catch((err) => {
      console.error("Error:", err);
      // Signal the failure to the calling shell instead of exiting with 0.
      process.exitCode = 1;
    });
  }
}