-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread.ts
More file actions
80 lines (68 loc) · 2.2 KB
/
read.ts
File metadata and controls
80 lines (68 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/**
* Read page content - enhanced markdown conversion
*/
import { SentienceBrowser } from './browser';
import TurndownService from 'turndown';
/**
* Options accepted by `read`.
*/
export interface ReadOptions {
/** Requested output format; `read` falls back to 'text' when this is omitted. */
format?: 'text' | 'markdown';
/**
* Controls the extra Turndown-based markdown conversion in `read`.
* Only an explicit `false` disables it (omitted means enabled), and it is
* only consulted when the format is 'markdown'.
*/
enhance_markdown?: boolean;
}
/**
* Result object returned by `read`.
*/
export interface ReadResult {
/** Outcome reported by the in-page reader; `read` only attempts markdown enhancement when this is 'success'. */
status: 'success' | 'error';
/** Presumably the URL of the page that was read (populated by the in-page reader) - TODO confirm. */
url: string;
/** Format of `content`. */
format: 'text' | 'markdown';
/** Page content; replaced with the Turndown output when markdown enhancement succeeds. */
content: string;
/** Set to `content.length` after a successful enhancement; otherwise whatever the in-page reader reported. */
length: number;
/**
* Error description. `read` sets this to "Markdown enhancement failed: ..." when the
* enhancement step throws, without changing `status` or `content`.
*/
error?: string;
}
/**
 * Convert an HTML string to markdown with Turndown.
 *
 * Uses ATX (`#`) headings, `-` bullets and fenced code blocks, renders
 * `<del>`, `<s>` and `<strike>` as `~~text~~`, and drops `script`, `style`,
 * `nav`, `footer`, `header` and `noscript` elements entirely.
 *
 * @param html - HTML markup to convert
 * @returns The markdown produced by Turndown
 */
function htmlToMarkdown(html: string): string {
  const turndownService = new TurndownService({
    headingStyle: 'atx', // Use # for headings
    bulletListMarker: '-', // Use - for lists
    codeBlockStyle: 'fenced', // Use ``` for code blocks
  });

  // A predicate filter matches the same three tags as the tag-name list did,
  // but does not need the `as any` cast the list form required.
  turndownService.addRule('strikethrough', {
    filter: (node: { nodeName: string }): boolean => {
      const tag = node.nodeName.toLowerCase();
      return tag === 'del' || tag === 's' || tag === 'strike';
    },
    replacement: (content: string) => `~~${content}~~`,
  });

  // Strip page chrome and non-content tags
  turndownService.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);

  return turndownService.turndown(html);
}

/**
 * Read page content as text or markdown.
 *
 * The content is first obtained by calling `window.sentience.read` inside the
 * page. When the format is 'markdown', enhancement is not disabled, and that
 * call reported success, the page's full HTML is re-converted with Turndown
 * and replaces `content` / `length`.
 *
 * Enhancement is best-effort: if it throws, the in-page result is returned
 * unchanged except that `error` is set to "Markdown enhancement failed: ...";
 * `status` is left as reported by the page.
 *
 * If the in-page call yields no object at all, a result with
 * `status: 'error'` is returned instead of an invalid value.
 *
 * @param browser - SentienceBrowser instance
 * @param options - Read options (`format` defaults to 'text', `enhance_markdown` to true)
 * @returns ReadResult with page content
 */
export async function read(
  browser: SentienceBrowser,
  options: ReadOptions = {}
): Promise<ReadResult> {
  const page = browser.getPage();
  const format = options.format || 'text';
  const enhanceMarkdown = options.enhance_markdown !== false; // Default to true

  // Get basic content from extension. `sentience` is not declared on
  // `Window`, hence the cast inside the page callback.
  const result = (await page.evaluate(
    (opts) => {
      return (window as any).sentience.read(opts);
    },
    { format }
  )) as ReadResult | null | undefined;

  // The cast above is unchecked: guard against the page returning nothing so
  // callers always receive a well-formed ReadResult.
  if (result == null || typeof result !== 'object') {
    return {
      status: 'error',
      url: '',
      format,
      content: '',
      length: 0,
      error: 'Extension returned no result',
    };
  }

  // Nothing more to do unless markdown enhancement applies.
  if (format !== 'markdown' || !enhanceMarkdown || result.status !== 'success') {
    return result;
  }

  try {
    // Get full HTML from page and use turndown for better conversion
    const htmlContent = await page.evaluate(
      () => document.documentElement.outerHTML
    );
    const enhancedMarkdown = htmlToMarkdown(htmlContent);
    result.content = enhancedMarkdown;
    result.length = enhancedMarkdown.length;
  } catch (e) {
    // If enhancement fails, keep the extension's content and record why
    result.error = `Markdown enhancement failed: ${e}`;
  }

  return result;
}