diff --git a/html2pdf4doc/html2pdf4doc.py b/html2pdf4doc/html2pdf4doc.py
index 491028e..32ac15b 100644
--- a/html2pdf4doc/html2pdf4doc.py
+++ b/html2pdf4doc/html2pdf4doc.py
@@ -1,6 +1,7 @@
import argparse
import atexit
import base64
+import contextlib
import os.path
import platform
import re
@@ -9,10 +10,11 @@
import zipfile
from datetime import datetime
from pathlib import Path
-from time import sleep
-from typing import Dict, List, Optional
+from time import sleep, time
+from typing import Dict, Iterator, List, Optional, Tuple
import requests
+from pypdf import PdfReader
from requests import Response
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
@@ -39,6 +41,28 @@
sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
+@contextlib.contextmanager
+def measure_performance(title: str) -> Iterator[None]:
+    """
+    Time the enclosed ``with`` block and print one aligned report line.
+
+    The printed line is the title padded with dots to 60 columns, followed
+    by the elapsed seconds (two decimals, dot-padded to 6 columns) and "s".
+    Nothing is printed when the enclosed block raises, because the code
+    after ``yield`` is not reached in that case.
+    """
+    # Function-scope import keeps this helper self-contained. perf_counter
+    # is a monotonic clock, so the measured duration cannot be skewed by
+    # system clock adjustments the way a difference of time() values can.
+    from time import perf_counter
+
+    time_start = perf_counter()
+    yield
+    time_diff = perf_counter() - time_start
+
+    padded_name = f"{title} ".ljust(60, ".")
+    padded_time = f" {time_diff:0.2f}".rjust(6, ".")
+    print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
+
+
+def extract_page_count(logs: List[Dict[str, str]]) -> int:
+    """
+    Return the page count reported in the given browser log entries.
+
+    Each entry's "message" value is searched for the quoted
+    "[HTML2PDF4DOC] Page count:" marker followed by an integer. The number
+    from the first matching entry is returned.
+
+    Raises ValueError when no entry contains the marker.
+    """
+    page_count_regex = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')
+
+    # Lazy generator: entries after the first hit are never inspected.
+    hits = (page_count_regex.search(log_entry["message"]) for log_entry in logs)
+    first_hit = next((hit for hit in hits if hit is not None), None)
+
+    if first_hit is None:
+        raise ValueError("No page count found in logs.")
+    return int(first_hit.group(1))
+
+
class ChromeDriverManager:
def get_chrome_driver(self, path_to_cache_dir: str) -> str:
chrome_version: Optional[str] = self.get_chrome_version()
@@ -253,7 +277,7 @@ def get_inches_from_millimeters(mm: float) -> float:
return mm / 25.4
-def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
+def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> Tuple[bytes, int]:
print(f"html2pdf4doc: opening URL with ChromeDriver: {url}") # noqa: T201
driver.get(url)
@@ -285,21 +309,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
}
class Done(Exception):
- pass
+ def __init__(self, page_count: int):
+ super().__init__()
+ self.page_count: int = page_count
datetime_start = datetime.today()
logs: List[Dict[str, str]] = []
+ page_count: int = 0
try:
while True:
logs = driver.get_log("browser") # type: ignore[no-untyped-call]
for entry_ in logs:
if "[HTML2PDF4DOC] Total time:" in entry_["message"]:
print("success: HTML2PDF4Doc completed its job.") # noqa: T201
- raise Done
+
+ page_count = extract_page_count(logs)
+
+ raise Done(page_count)
if (datetime.today() - datetime_start).total_seconds() > 60:
raise TimeoutError
- sleep(0.5)
+ sleep(0.1)
except Done:
pass
except TimeoutError:
@@ -322,7 +352,13 @@ class Done(Exception):
result = driver.execute_cdp_cmd("Page.printToPDF", calculated_print_options)
data = base64.b64decode(result["data"])
- return data
+
+ if page_count == 0:
+ raise RuntimeError(
+ "html2pdf4doc: Something went wrong. "
+ "Could not capture the printed page count from Chrome."
+ )
+ return data, page_count
def create_webdriver(
@@ -521,9 +557,20 @@ def exit_handler() -> None:
url = Path(os.path.abspath(path_to_input_html)).as_uri()
- pdf_bytes = get_pdf_from_html(driver, url)
+ pdf_bytes, page_count = get_pdf_from_html(driver, url)
with open(path_to_output_pdf, "wb") as f:
f.write(pdf_bytes)
+
+ with measure_performance("html2pdf4doc: validating page count"):
+ reader = PdfReader(path_to_output_pdf)
+ if len(reader.pages) != page_count:
+ raise RuntimeError(
+ "Something went wrong with the printed page. "
+ f"Page count mismatch: "
+ f"PDF pages: {len(reader.pages)}, "
+ f"html2pdf4doc pages: {page_count}."
+ )
+
else:
print("html2pdf4doc: unknown command.") # noqa: T201
sys.exit(1)
diff --git a/pyproject.toml b/pyproject.toml
index ea8c6e6..565cc59 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,9 @@ dependencies = [
# requests is used for downloading the Chrome driver.
"requests",
+
+ # pypdf is used for validating the printed PDF.
+ "pypdf>=3.9.0",
]
[project.optional-dependencies]
diff --git a/requirements.development.txt b/requirements.development.txt
index b3f7d5b..17b7491 100644
--- a/requirements.development.txt
+++ b/requirements.development.txt
@@ -16,6 +16,3 @@ ruff>=0.9
#
lit
filecheck==0.0.24
-
-# Integration tests use PyPDF to check the contents of the printed PDF.
-pypdf==3.9.0