-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_markdown_async.py
More file actions
58 lines (45 loc) · 2.2 KB
/
read_markdown_async.py
File metadata and controls
58 lines (45 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Example: Reading page content and converting to markdown (Async version)
This example shows how to use the read_async() function to get page content
and convert it to high-quality markdown using markdownify.
"""
import asyncio
import os
from markdownify import markdownify
from sentience.async_api import AsyncSentienceBrowser, read_async
async def main():
    """Load https://example.com and print its content in three formats.

    Walks through three ways of calling ``read_async`` on the same page:

    1. default output, converted locally with ``markdownify``;
    2. ``output_format="markdown"``;
    3. ``output_format="text"``.

    For each one the content length and the first 500 characters are
    printed to stdout.
    """
    preview_chars = 500  # how much of each result is echoed to stdout

    # The key is optional: when the variable is unset this is None and the
    # free tier is used.
    sentience_key = os.environ.get("SENTIENCE_API_KEY")

    # The context manager owns the browser for the duration of the demo.
    async with AsyncSentienceBrowser(api_key=sentience_key, headless=True) as browser:
        await browser.goto("https://example.com", wait_until="domcontentloaded")

        # --- Method 1: raw HTML (the default output_format) + markdownify ---
        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
        raw_page = await read_async(browser)
        converter_options = {
            "heading_style": "ATX",  # '#'-prefixed headings
            "bullets": "-",  # '-' as the list marker
            # NOTE(review): markdownify's ``strip`` appears to drop only the
            # tag markup and keep the inner text — confirm script/style
            # bodies do not leak into the output.
            "strip": ["script", "style", "nav", "footer", "header"],
        }
        markdown = markdownify(raw_page["content"], **converter_options)
        print(f"Markdown length: {len(markdown)} characters")
        print(markdown[:preview_chars])
        print("\n")

        # --- Method 2: ask read_async for markdown directly ---
        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
        markdown_page = await read_async(browser, output_format="markdown")
        direct_markdown = markdown_page["content"]
        print(f"Markdown length: {len(direct_markdown)} characters")
        print(direct_markdown[:preview_chars])
        print("\n")

        # --- Method 3: plain text ---
        print("=== Method 3: Plain text ===")
        text_page = await read_async(browser, output_format="text")
        plain_text = text_page["content"]
        print(f"Text length: {len(plain_text)} characters")
        print(plain_text[:preview_chars])
if __name__ == "__main__":
    # Start the event loop only when run as a script, so importing this
    # module has no side effects.
    asyncio.run(main())