Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
strategy:
matrix:
python-version: [
"3.8", "3.12"
"3.9", "3.13"
]

steps:
Expand Down
69 changes: 69 additions & 0 deletions .github/workflows/ci_fuzz_linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Fuzz-tests HTML2PDF4Doc on Linux: installs the development requirements,
# builds the JS bundle, runs the fuzzer and uploads whatever was left in
# output/ as a build artifact.
name: "HTML2PDF4Doc Fuzz Testing on Linux"

on:
  pull_request:
    branches: [ "**" ]

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 120  # 2 hours

    strategy:
      matrix:
        python-version: [
          "3.12"
        ]

    steps:
    - uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip

    - name: Install Python packages
      run: |
        pip install -r requirements.development.txt

    - name: Clone html2pdf4doc.js
      run: |
        invoke bootstrap
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Install html2pdf4doc dependencies.
      run: |
        python developer/pip_install_html2pdf4doc_deps.py

    - name: Run Lint tasks
      run: |
        invoke lint

    - name: Build HTML2PDF4Doc.js
      run: |
        invoke build

    # NOTE: `on:` above declares only `pull_request`, so the "schedule"
    # branch below is not reached by this workflow as written; it becomes
    # active only if a `schedule:` trigger is added.
    - name: Run tests
      run: |
        if [ "${{ github.event_name }}" = "schedule" ]; then
          echo "🕒 Running long fuzzing..."
          invoke test-fuzz --long
        else
          echo "🚀 Running short fuzzing..."
          invoke test-fuzz
        fi

    - name: Upload broken PDFs as artifact
      # Always upload, even if the job fails. always() is true on success,
      # failure and cancellation alike, so no extra failure() check is needed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: broken-pdfs
        path: output/
        retention-days: 30
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,7 @@ tests/integration/.lit_test_times.txt
tests/integration/**/Output/
output/

__pycache__/

# Fuzz testing files.
**.mut.**
1 change: 1 addition & 0 deletions html2pdf4doc/html2pdf4doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

__version__ = "0.0.22"

# Path of this module's own source file, as reported by the interpreter.
# Exposed as a constant so that other tools can launch this script in a
# subprocess (e.g. via `sys.executable`).
PATH_TO_HTML2PDF4DOC_PY = __file__
PATH_TO_HTML2PDF4DOC_JS = os.path.join(
os.path.dirname(os.path.join(__file__)),
"html2pdf4doc_js",
Expand Down
193 changes: 193 additions & 0 deletions html2pdf4doc/html2pdf4doc_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import argparse
import contextlib
import datetime
import os.path
import random
import shutil
import sys
from pathlib import Path
from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
from time import time
from typing import Iterator, List

from faker import Faker
from lxml import etree, html

from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY


@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Print how long the wrapped ``with`` block took to run.

    The report is a single line: ``title`` padded with dots to 60 columns,
    followed by the elapsed wall-clock seconds with two decimals.

    The line is printed from a ``finally`` clause, so the timing is reported
    even when the wrapped block raises; the exception itself is not
    swallowed and keeps propagating to the caller.
    """
    time_start = time()
    try:
        yield
    finally:
        time_diff = time() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201


def mutate_and_print(path_to_input_file: str, path_to_root: str) -> bool:
    """
    Mutate an HTML file with random text and print the result to PDF.

    If the document contains ``<p>`` or ``<td>`` elements, 25 randomly chosen
    ones (repetitions possible) get their leading text replaced with
    Faker-generated text. The mutated document is written next to the input
    as ``<input>.mut.html`` and printed to ``<input>.mut.html.pdf`` by
    running the html2pdf4doc script in a subprocess with ``print --strict``.

    When the subprocess exits with a non-zero status, ``path_to_root`` and
    the ``html2pdf4doc`` directory are copied under ``output/`` (only once),
    and timestamped copies of the mutated HTML and, if it was produced, the
    PDF are stored there.

    NOTE(review): ``output/`` and ``html2pdf4doc`` are resolved against the
    current working directory, so this presumably expects to be run from the
    repository root. Confirm against the callers.

    :param path_to_input_file: Existing HTML file to mutate; expected to be
        located inside ``path_to_root``.
    :param path_to_root: Existing directory that is mirrored under
        ``output/`` on failure. May be absolute or relative.
    :return: ``True`` if the print command succeeded, ``False`` otherwise.
    :raises TimeoutError: If the subprocess raises ``TimeoutExpired``. No
        timeout is passed to ``run`` here, so this is only reachable once
        one is added.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file
    assert os.path.isdir(path_to_root), path_to_root

    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Replace the text of randomly picked elements.
    elems = tree.xpath("//p | //td")
    if elems:
        # One generator is enough for all mutations of this cycle.
        fake = Faker()
        for _i in range(25):
            node = random.choice(elems)

            print("Mutating node:", node.tag, flush=True)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html, flush=True)  # noqa: T201

    # Drop a PDF left over from a previous cycle so that a failed run can
    # never be reported together with a stale PDF.
    with contextlib.suppress(FileNotFoundError):
        os.remove(path_to_mut_pdf)

    cmd: List[str] = [
        sys.executable,
        PATH_TO_HTML2PDF4DOC_PY,
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    # relpath() accepts both absolute and relative inputs, so the root may be
    # given either way and is still mirrored under output/ by its location
    # relative to the current working directory.
    relative_path_to_root = os.path.relpath(path_to_root)
    path_to_mut_output = f"output/{relative_path_to_root}"

    def copy_files_if_needed() -> None:
        # The snapshot is taken only once: later failures reuse it.
        if os.path.isdir(path_to_mut_output):
            return

        shutil.rmtree("output", ignore_errors=True)
        Path("output").mkdir(parents=True, exist_ok=True)

        shutil.copytree(
            "html2pdf4doc", "output/html2pdf4doc", dirs_exist_ok=True
        )

        shutil.rmtree(path_to_mut_output, ignore_errors=True)
        Path(path_to_mut_output).mkdir(parents=True, exist_ok=True)

        shutil.copytree(path_to_root, path_to_mut_output, dirs_exist_ok=True)

    def copy_mutated_file() -> None:
        relative_path_to_mut_html = os.path.relpath(
            path_to_mut_html, path_to_root
        )

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        path_to_mut_html_out = os.path.join(
            path_to_mut_output,
            f"{relative_path_to_mut_html}.{timestamp}.html",
        )
        shutil.copy(path_to_mut_html, path_to_mut_html_out)

        path_to_mut_pdf_out = os.path.join(
            path_to_mut_output,
            f"{relative_path_to_mut_html}.{timestamp}.pdf",
        )
        # The print command may have failed before any PDF was written.
        if os.path.isfile(path_to_mut_pdf):
            shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
        else:
            path_to_mut_pdf_out = "<not produced>"

        print(  # noqa: T201
            f"Saved failed mutated HTML as:\n"
            f"HTML: {path_to_mut_html_out}\n"
            f"PDF: {path_to_mut_pdf_out}"
        )

    with measure_performance(
        "html2pdf4doc_fuzzer: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            _: CompletedProcess[bytes] = run(
                cmd, capture_output=False, check=True, bufsize=1
            )
        except CalledProcessError as called_process_error_:
            print(called_process_error_)  # noqa: T201

            copy_files_if_needed()

            copy_mutated_file()

            return False
        except TimeoutExpired:
            raise TimeoutError from None
    return True


def main() -> None:
    """
    Command-line entry point of the fuzzer.

    Recreates an empty ``output/`` directory, then runs ``mutate_and_print``
    20 times (200 times with ``--long``) on the given input file, printing a
    progress line before each cycle and a summary at the end.

    Exits with status 1 if at least one cycle failed.
    """
    parser = argparse.ArgumentParser(
        description="Fuzz html2pdf4doc by printing randomly mutated HTML."
    )

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file that is mutated and printed to PDF.",
    )
    parser.add_argument(
        "root_path",
        type=str,
        help=(
            "Path to the directory that contains the input file. "
            "It is copied to output/ when a print cycle fails."
        ),
    )
    parser.add_argument(
        "--long",
        action="store_true",
        help="Run the fuzzer in long mode (more iterations).",
    )

    args = parser.parse_args()

    path_to_input_file = args.input_file
    path_to_root = args.root_path

    # Start every session from an empty output/ directory.
    shutil.rmtree("output", ignore_errors=True)
    Path("output").mkdir(parents=True, exist_ok=True)

    total_runs = 200 if args.long else 20
    success_count, failure_count = 0, 0
    for i in range(1, total_runs + 1):
        print(  # noqa: T201
            f"html2pdf4doc_fuzzer print cycle #{i}/{total_runs} — "
            f"So far: 🟢{success_count} / 🔴{failure_count}",
            flush=True,
        )
        success = mutate_and_print(path_to_input_file, path_to_root)
        if success:
            success_count += 1
        else:
            failure_count += 1

    # total_runs is one of two positive literals, so the division is safe.
    success_rate_percent = (success_count / total_runs) * 100

    print(  # noqa: T201
        f"html2pdf4doc_fuzzer: finished {'✅' if failure_count == 0 else '❌'} — "
        f"Success rate: {success_count}/{total_runs} "
        f"({success_rate_percent:.1f}%)",
        flush=True,
    )

    if failure_count > 0:
        sys.exit(1)


# Run the fuzzer only when this file is executed directly as a script;
# importing the module must not start a fuzzing session.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ development = [

[project.scripts]
html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"

[project.urls]
Changelog = "https://github.com/mettta/html2pdf_python/releases/"
Expand Down
6 changes: 6 additions & 0 deletions requirements.development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ ruff>=0.9
#
lit
filecheck==0.0.24

#
# Fuzz tests
#
faker>=37.8.0
lxml>=5.3.0
12 changes: 12 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,18 @@ def test_integration(
run_invoke(context, itest_command)


@task(aliases=["tf"])
def test_fuzz(context, long=False):
    """
    Run the HTML2PDF4Doc fuzzer against the StrictDoc user guide fixture.

    :param context: Invoke context forwarded to ``run_invoke``.
    :param long: When true (``invoke test-fuzz --long``), pass ``--long`` to
        the fuzzer so that it runs more iterations. Defaults to the short
        mode, which keeps ``invoke test-fuzz`` working as before.
    """
    # The command is written across several lines, so run_invoke is
    # presumably joining them into one command line already; the optional
    # flag is appended as one more such line.
    command = """
        python html2pdf4doc/html2pdf4doc_fuzzer.py
            tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
            tests/fuzz/01_strictdoc_guide_202510/
    """
    if long:
        command += """
            --long
        """
    run_invoke(context, command)


@task(aliases=["t"])
def test(context):
test_integration(context)
Expand Down
Loading
Loading