-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathllm_documentcloud.py
More file actions
143 lines (112 loc) · 3.65 KB
/
llm_documentcloud.py
File metadata and controls
143 lines (112 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import re
from urllib.parse import parse_qs, urlsplit
from typing import List, Optional
import llm
from documentcloud import DocumentCloud
# Credentials are read from the environment at import time; each one is
# None when its variable is not set
DC_USERNAME = os.getenv("DC_USERNAME")
DC_PASSWORD = os.getenv("DC_PASSWORD")

# URL path of a document: /documents/<numeric id>-<slug>, optional trailing slash
PATH_RE = re.compile(r"^/documents/(\d+)-([-\w]+)/?")

# URL fragment that points at a single page, e.g. "document/p3"
PAGE_RE = re.compile(r"document/p(\d+)")
# Every accepted mode name, keyed to the canonical mode it stands for
# ("text", "image" or "pdf")
MODES = {
    alias: canonical
    for canonical, aliases in (
        ("text", ("document", "text")),
        ("image", ("image", "images", "grid")),
        ("pdf", ("pdf", "raw")),
    )
    for alias in aliases
}
class DCArgs:
"Helper class for passing around args"
def __init__(self, id: str | int, mode: str = "text", page: Optional[int] = None):
self.id = id
self.mode = mode
self.page = page
def __eq__(self, other):
tuple(self) == tuple(other)
def __iter__(self):
return iter((self.id, self.mode, self.page))
def __repr__(self):
return f"({self.id}, {self.mode}, {self.page})"
@llm.hookimpl
def register_fragment_loaders(register):
    """
    llm hook implementation: register load_document under the name "dc"
    """
    prefix = "dc"
    register(prefix, load_document)
def load_document(
    argument: str,
) -> llm.Fragment | List[llm.Fragment] | llm.Attachment | List[llm.Attachment]:
    """
    Fetch a document given its ID or URL and wrap it for llm

    What comes back depends on the parsed mode:
    - "pdf": one attachment pointing at the document's PDF URL
    - "image": one attachment for the requested page, or a list with one
      attachment per page when no page was given
    - anything else: a fragment holding the page text or the full text
    """
    # arguments starting with https:// go through the URL parser, the rest are IDs
    parse = parse_dc_url if argument.startswith("https://") else parse_dc_id
    args = parse(argument)

    client = DocumentCloud(DC_USERNAME, DC_PASSWORD)
    # no error handling here on purpose: a failed lookup (e.g. Not Found) propagates
    doc = client.documents.get(args.id)

    if args.mode == "pdf":
        # the PDF is always the whole document, page is ignored
        return llm.Attachment(url=doc.get_pdf_url())

    if args.mode == "image":
        if not args.page:
            # no page requested: one attachment per page image
            return [
                llm.Attachment(url=image_url)
                for image_url in doc.get_image_url_list(size="large")
            ]
        return llm.Attachment(url=doc.get_image_url(page=args.page, size="large"))

    # text mode
    text = doc.get_page_text(args.page) if args.page else doc.full_text

    # the source is built from the raw argument, so the same resource reached
    # through different spellings gets different sources; normalizing the
    # argument would avoid that duplication
    return llm.Fragment(text, f"dc:{argument}")
def parse_dc_url(url: str) -> DCArgs:
    """
    Parse a document URL into ID, mode and page

    The ID is the leading number of the /documents/<id>-<slug> path. The mode
    is taken from the "mode" query parameter and normalized through MODES;
    a missing or unrecognized mode becomes "text". The page is taken from a
    "document/p<N>" fragment and is None when there is no such fragment.

    Raises ValueError when the URL path does not match PATH_RE.

    >>> args = parse_dc_url("https://www.documentcloud.org/documents/25507045-20250118-ufc-intuit-dome-athlete-pay-and-weights-c-amico/?mode=images")
    >>> print(args)
    (25507045, image, None)
    """
    parts = urlsplit(url)

    path_match = PATH_RE.match(parts.path)
    if path_match is None:
        raise ValueError(f"No ID found in URL: {url}")
    # PATH_RE defines group 1, so it is always available after a successful match
    doc_id = path_match.group(1)

    requested_mode = parse_qs(parts.query).get("mode", ["text"])[0]
    mode = MODES.get(requested_mode, "text")

    # match() yields None for an empty or unrelated fragment, leaving page unset
    page_match = PAGE_RE.match(parts.fragment)
    page = int(page_match.group(1)) if page_match else None

    return DCArgs(doc_id, mode, page)
def parse_dc_id(doc_id: str) -> DCArgs:
    """
    Parse an ID string, returning the ID, mode and page

    The string is a document ID, optionally followed by "?" and a query
    string with "mode" and/or "page" parameters. A missing or unrecognized
    mode becomes "text"; a missing or non-integer page becomes None.
    Surrounding whitespace is removed from the ID.

    >>> args = parse_dc_id("25507045?mode=pdf&page=1")
    >>> print(args)
    (25507045, pdf, 1)
    """
    # partition() leaves query empty when there is no "?", which produces
    # the default mode and page below
    raw_id, _, query = doc_id.partition("?")
    qs = parse_qs(query)

    requested_mode = qs.get("mode", ["text"])[0]
    mode = MODES.get(requested_mode, "text")

    # Get page number if specified
    page = None
    if "page" in qs:
        try:
            page = int(qs["page"][0])
        except ValueError:
            # If page is not a valid integer, keep it as None
            pass

    # strip the ID whether or not a query string was supplied
    return DCArgs(raw_id.strip(), mode, page)