# paper-convert-scripts/pdf_checker.py at main · pnb/paper-convert-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import argparse
import atexit
import os
import re
import shlex
import shutil
import subprocess
import tempfile

import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from PIL import Image

import shared

# Command-line interface: one required PDF path plus an optional integer page limit
# that is applied to the pages preceding the references
ap = argparse.ArgumentParser(description="Check a PDF for any known issues")
ap.add_argument("pdf_path", help="PDF file path")
ap.add_argument(
    "--max-preref-pages",
    help="Max allowed pages before refs",
    type=int,
)
args = ap.parse_args()


def count_nonblank_pixels(img: Image.Image, x1: int, y1: int, x2: int, y2: int) -> int:
    """Return the number of non-blank pixels in the given rectangle.

    Assumes image is in grayscale via `img.convert("L")`, so every pixel is a single
    0-255 value and anything darker than pure white (255) counts as non-blank.

    Args:
        img: Page image, already converted to mode "L".
        x1: Left edge of the rectangle, in pixels.
        y1: Upper edge of the rectangle, in pixels.
        x2: Right edge of the rectangle, in pixels (passed to `crop` as the right bound).
        y2: Lower edge of the rectangle, in pixels (passed to `crop` as the lower bound).

    Returns:
        The count of pixels in the rectangle whose value is below 255.
    """
    region = img.crop((x1, y1, x2, y2))
    # For a mode "L" image the histogram has one bin per value 0..255, so summing
    # every bin except the last counts the pixels below 255 without looping over
    # each pixel in Python
    return sum(region.histogram()[:255])


# Render each page of the PDF to page-<n>.png inside a scratch directory, working on a
# copy of the PDF so the command can run with the scratch directory as its cwd
tmpdir = tempfile.mkdtemp()
# Remove the scratch directory (PDF copy and page images) when the script exits,
# including the early exit below; otherwise every run leaves a directory behind
atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)
pdf_basename = os.path.basename(args.pdf_path)
shutil.copyfile(args.pdf_path, os.path.join(tmpdir, pdf_basename))
# NOTE(review): exec_grouping_subprocesses is defined in `shared`; presumably it runs
# the command and returns its exit status -- confirm against that module
retcode = shared.exec_grouping_subprocesses(
    "convert -density 100 -background white -alpha remove -alpha off "
    + shlex.quote(pdf_basename)
    + " page-%d.png",
    shell=True,
    cwd=tmpdir,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)
if retcode != 0:
    # Raise SystemExit directly: the `exit` helper is injected by the `site` module
    # and is not guaranteed to exist in every interpreter configuration
    raise SystemExit(retcode)

# Count non-blank pixels in margin to see if margins are correctly empty
# Only files named exactly like the rendered pages ("page-<n>.png") are inspected: the
# input PDF is copied into the same directory, and one named e.g. "page-proofs.pdf"
# must not be mistaken for a page image. Pages are then visited in numeric order so
# the report is deterministic (os.listdir() returns entries in arbitrary order).
page_images = {}
for fname in os.listdir(tmpdir):
    page_match = re.fullmatch(r"page-([0-9]+)\.png", fname)
    if page_match:
        # File numbers are shifted by one so that reported page numbers start at 1
        page_images[int(page_match.group(1)) + 1] = fname
for page_num, fname in sorted(page_images.items()):
    with Image.open(os.path.join(tmpdir, fname)) as img:
        img = img.convert("L")
        if img.size != (850, 1100):
            print(
                "page size: Page",
                page_num,
                "is the wrong size; should be 8.5 × 11 inches, found",
                img.size[0] / 100,
                "×",
                img.size[1] / 100,
            )
            continue  # If page is wrong size, nothing else can be checked well
        if count_nonblank_pixels(img, 0, 0, 65, 1100) > 20:
            print("margins: Page", page_num, "has content in left margin")
        # Right margin: a few stray pixels are tolerated in the outer 65 px strip, but
        # none at all in the outermost 60 px
        if (
            count_nonblank_pixels(img, 785, 0, 850, 1100) > 20
            or count_nonblank_pixels(img, 790, 0, 850, 1100) > 0
        ):
            print("margins: Page", page_num, "has content in right margin")
        if count_nonblank_pixels(img, 0, 0, 850, 60) > 0:
            # We have to give extra space here because of the \topfraction "feature"
            # of LaTeX that allows figures into the top margin in certain cases
            print("margins: Page", page_num, "has content in top margin")
        if count_nonblank_pixels(img, 0, 1030, 850, 1100) > 0:
            print("margins: Page", page_num, "has content in bottom margin")
        # Check copyright block on first page is blank (working around instructions
        # text that is present for MSWord version)
        if page_num == 1 and (
            count_nonblank_pixels(img, 0, 880, 420, 908) > 0
            or count_nonblank_pixels(img, 0, 945, 420, 1005) > 0
        ):
            print("copyright block: The copyright block has unexpected content")

# Walk the PDF's text one character at a time to recover the title, spot the
# references/acknowledgements and appendix headings, and collect font sizes for the
# checks that follow
preref_page_count = 0  # Count of pages before references (0 = heading not seen yet)
appendix_before_refs = False
char_font_sizes = []
title_chars = []
title = ""
refs_heading_re = re.compile(r"(\d*|[a-z])\.?\s*(references|acknowledge?ments?)")
appendix_heading_re = re.compile(r"\d*\.?\s*appendi(x|ces)")
for page_i, page_layout in enumerate(extract_pages(args.pdf_path)):
    chars_in_page = 0
    cur_heading = []  # Current unbroken run of heading-sized characters on this page
    for container in page_layout:
        if not isinstance(container, LTTextContainer):
            continue
        for line in container:
            if not isinstance(line, LTTextLineHorizontal):
                continue
            for glyph in line:
                if not isinstance(glyph, LTChar):
                    continue
                size = glyph.size
                if not title:
                    # Large characters accumulate as the title; the first smaller
                    # character after them finalizes it
                    if size > 17:
                        title_chars.append(glyph.get_text().replace("\n", " "))
                    else:
                        title = "".join(title_chars).strip()
                if 11.9 < size < 12.1:
                    cur_heading.append(glyph.get_text())
                    heading_str = "".join(cur_heading).lower()
                    if preref_page_count == 0 and refs_heading_re.match(heading_str):
                        # A heading preceded by other text on the page is mid-page, so
                        # that page counts too; otherwise only the pages before it do
                        mid_page = chars_in_page > len(cur_heading)
                        preref_page_count = page_i + 1 if mid_page else page_i
                    if appendix_heading_re.match(heading_str):
                        if preref_page_count == 0:
                            appendix_before_refs = True
                else:
                    cur_heading.clear()
                chars_in_page += 1
                char_font_sizes.append(size)

# Report the findings gathered from the PDF text, one "category: message" line each
print("info: title=" + title)  # Not an error, just a way to get the title for later
if appendix_before_refs:
    print("appendix location: Appendices should be after the references, not before")
# The limit is only enforced when one was given (a missing or zero value skips it)
if args.max_preref_pages and preref_page_count > args.max_preref_pages:
    print(
        "page limit: The paper has content on",
        preref_page_count,
        "pages before references, which is more than the maximum of",
        args.max_preref_pages,
    )
# With no extractable characters np.median([]) would emit a RuntimeWarning and return
# nan; use nan directly so the comparison below still reports nothing, but quietly
mdn_font_size = np.median(char_font_sizes) if char_font_sizes else float("nan")
if mdn_font_size < 8.75 or mdn_font_size > 9.25:
    print("font size: The median font size is", mdn_font_size, "pt (should be 9)")