Skip to content

Commit d007316

Browse files
committed
feat: enhance PDF generation by adding Markdown support and custom CSS integration
1 parent c0ff5cf commit d007316

2 files changed

Lines changed: 90 additions & 10 deletions

File tree

pdfdol/tools.py

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,14 @@
44
from typing import Literal, Callable, Union
55
import os
66
import io
7+
8+
import markdown
9+
import pdfkit
10+
711
from dol import Pipe
812

913
# Define the allowed source kinds
10-
SrcKind = Literal["url", "html", "file"]
14+
SrcKind = Literal["url", "html", "file", "md", "markdown", "text"]
1115

1216

1317
def _resolve_src_kind(src: str) -> SrcKind:
@@ -42,7 +46,10 @@ def _resolve_src_kind(src: str) -> SrcKind:
4246
elif "<html" in s.lower():
4347
return "html"
4448
elif os.path.exists(s):
45-
return "file"
49+
if s.endswith(".html") or s.endswith(".htm") or s.endswith(".xhtml"):
50+
return "file"
51+
else:
52+
return "markdown"
4653
else:
4754
# Fallback: if it doesn't look like a URL or HTML and isn't an existing file path, assume it's text.
4855
return "text"
@@ -92,6 +99,73 @@ def write_to_file(b: bytes) -> str:
9299
raise ValueError("egress must be None, a file path string, or a callable.")
93100

94101

102+
dflt_css = """
103+
<style>
104+
table {
105+
width: 100%;
106+
border-collapse: collapse;
107+
border: 1px solid black;
108+
}
109+
th, td {
110+
border: 1px solid black;
111+
padding: 8px;
112+
text-align: left;
113+
}
114+
th {
115+
background-color: #f2f2f2;
116+
}
117+
</style>
118+
"""
119+
120+
121+
def add_css(html_text: str, css=dflt_css) -> str:
    """Wrap an HTML fragment in a minimal HTML document that carries ``css``.

    Args:
        html_text: HTML markup to place inside ``<body>``.
        css: Markup to place inside ``<head>`` (for example a ``<style>``
            block). Defaults to the module-level ``dflt_css``.

    Returns:
        The string ``<html><head>{css}</head><body>{html_text}</body></html>``.
    """
    # Interpolate the ``css`` parameter: the previous body referenced an
    # undefined name (``custom_css``) and raised NameError on every call.
    return f"<html><head>{css}</head><body>{html_text}</body></html>"
123+
124+
125+
# Default keyword arguments for pdfkit when saving to PDF
126+
dflt_pdfkit_kwargs = {
127+
"options": {
128+
"encoding": "UTF-8",
129+
"page-size": "A4",
130+
"margin-top": "10mm",
131+
"margin-right": "10mm",
132+
"margin-bottom": "10mm",
133+
"margin-left": "10mm",
134+
}
135+
}
136+
137+
dflt_markdown_kwargs = {
138+
"extensions": ("extra", "tables"),
139+
}
140+
141+
142+
def markdown_to_pdf(
    md_src: str,
    egress: Union[None, str, Callable] = None,
    *,
    markdown_extensions=dflt_markdown_kwargs,
    **pdfkit_kwargs,
):
    """Render Markdown (a string or a path to a Markdown file) to PDF.

    Args:
        md_src: Markdown text, or the path of an existing file whose contents
            are read as UTF-8 Markdown.
        egress: What to do with the PDF.
            * ``None``: passed to ``pdfkit.from_string`` as the output target.
            * ``str``: passed to ``pdfkit.from_string`` as the output path.
            * callable: called with the PDF produced by
              ``pdfkit.from_string(..., None, ...)``; its result is returned.
        markdown_extensions: Either a dict of keyword arguments for
            ``markdown.markdown`` (the shape of ``dflt_markdown_kwargs``), or
            an extension name / iterable of extension names. ``None`` falls
            back to ``dflt_markdown_kwargs``.
        **pdfkit_kwargs: Keyword arguments for ``pdfkit.from_string``. They are
            layered over ``dflt_pdfkit_kwargs`` key by key.

    Returns:
        Whatever ``pdfkit.from_string`` returns when ``egress`` is not
        callable, otherwise the return value of ``egress``.
    """
    # NOTE: this is a shallow merge -- a caller-supplied ``options`` replaces
    # the default ``options`` dict as a whole rather than being merged into it.
    pdfkit_kwargs = {**dflt_pdfkit_kwargs, **pdfkit_kwargs}

    if isinstance(md_src, str) and os.path.isfile(md_src):
        with open(md_src, "r", encoding="utf-8") as f:
            md_src = f.read()

    # Honour the caller's ``markdown_extensions``; previously the parameter was
    # accepted but ignored in favour of the module default.
    if markdown_extensions is None:
        markdown_kwargs = dflt_markdown_kwargs
    elif isinstance(markdown_extensions, dict):
        markdown_kwargs = markdown_extensions
    elif isinstance(markdown_extensions, str):
        markdown_kwargs = {"extensions": [markdown_extensions]}
    else:
        markdown_kwargs = {"extensions": list(markdown_extensions)}

    # Convert Markdown to HTML
    html_text = markdown.markdown(md_src, **markdown_kwargs)

    if not callable(egress):
        return pdfkit.from_string(html_text, egress, **pdfkit_kwargs)

    # egress is a function: ask pdfkit for the PDF bytes (output target None)
    # and hand them over. The pdfkit keyword arguments must be forwarded here
    # as well, otherwise this path ignores page size, margins, encoding, etc.
    pdf_bytes = pdfkit.from_string(html_text, None, **pdfkit_kwargs)
    return egress(pdf_bytes)
167+
168+
95169
def get_pdf(
96170
src: str,
97171
egress: Union[None, str, Callable] = None,
@@ -136,7 +210,7 @@ def get_pdf(
136210
css: (optional) string with path to css file which will be added to a single input file
137211
configuration: (optional) instance of pdfkit.configuration.Configuration()
138212
cover_first: (optional) if True, cover always precedes TOC
139-
:verbose: (optional) By default '--quiet' is passed to all calls, set this to False to get wkhtmltopdf output to stdout.
213+
verbose: (optional) By default '--quiet' is passed to all calls, set this to False to get wkhtmltopdf output to stdout.
140214
141215
142216
Returns:
@@ -160,9 +234,8 @@ def get_pdf(
160234
161235
162236
"""
163-
import pdfkit
164-
165237
_kwargs = dict(
238+
dflt_pdfkit_kwargs,
166239
options=options,
167240
toc=toc,
168241
cover=cover,
@@ -175,19 +248,24 @@ def get_pdf(
175248
# Determine the source kind if not explicitly provided.
176249
if src_kind is None:
177250
src_kind = _resolve_src_kind(src)
251+
elif src_kind == 'md':
252+
src_kind = 'markdown'
178253

179254
if src_kind == "url":
180255
_kwargs.pop(
181256
"css", None
182257
) # because from_url, for some reason, doesn't have a css argument
183258

184-
_add_options = lambda func: partial(func, **_kwargs, **kwargs)
259+
_pdfkit_kwargs = dict(**_kwargs, **kwargs)
260+
_add_pdfkit_options = lambda func: partial(func, **_pdfkit_kwargs)
185261
# Map the source kind to the corresponding pdfkit function.
186262
func_for_kind = {
187-
"url": _add_options(pdfkit.from_url),
188-
"text": _add_options(pdfkit.from_string),
189-
"html": Pipe(io.StringIO, _add_options(pdfkit.from_file)),
190-
"file": _add_options(pdfkit.from_file),
263+
"url": _add_pdfkit_options(pdfkit.from_url),
264+
"text": _add_pdfkit_options(pdfkit.from_string),
265+
"html": Pipe(io.StringIO, _add_pdfkit_options(pdfkit.from_file)),
266+
"file": _add_pdfkit_options(pdfkit.from_file),
267+
# egress=None to force bytes output in markdown:
268+
"markdown": partial(markdown_to_pdf, egress=None, **_pdfkit_kwargs),
191269
}
192270
src_to_bytes_func = func_for_kind.get(src_kind)
193271
if src_to_bytes_func is None:

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,7 @@ zip_safe = False
1818
install_requires =
1919
dol
2020
pypdf
21+
markdown
22+
pdfkit
2123

2224

0 commit comments

Comments
 (0)