Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions formfyxer/lit_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ def _rewrite_pdf_fields_in_place(
"PDF field traversal count did not match parsed field-name count"
)

for field_data, old_name, new_name in zip(flattened_fields, field_names, new_names):
for field_data, old_name, new_name in zip(
flattened_fields, field_names, new_names
):
cleaned_name = re.sub(r"^\*", "", new_name)
if old_name == cleaned_name:
continue
Expand All @@ -159,7 +161,9 @@ def _rewrite_pdf_fields_in_place(
continue

# Nested fields keep their parent hierarchy; flat fields accept full dotted names.
target_name = cleaned_name.split(".")[-1] if "." in old_name else cleaned_name
target_name = (
cleaned_name.split(".")[-1] if "." in old_name else cleaned_name
)
target.T = target_name

my_pdf.save(in_file)
Expand Down
36 changes: 21 additions & 15 deletions formfyxer/pdf_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,31 +468,37 @@ def _unnest_pdf_fields(
parent_name = []
if hasattr(field, "T"):
parent_name.append(str(field.T))
if hasattr(field, "FT") and hasattr(field, "F"):
# PDF fields have bit flags for specific options. The 17th bit (or hex
# 10000) on Buttons mark a "push button", w/o a permanent value
# (e.g. "Print this PDF") They aren't really fields, just skip them.
if hasattr(field, "Ff") and field.FT == "/Btn" and bool(field.Ff & 0x10000):
return []
return [{"type": field.FT, "var_name": ".".join(parent_name), "all": field}]
elif parent_type and parent_flags and hasattr(field, "F"):
if parent_flags and parent_type == "/Btn" and bool(parent_flags & 0x10000):
return []
return [{"type": parent_type, "var_name": ".".join(parent_name), "all": field}]
elif hasattr(field, "Kids"):
effective_type = str(field.FT) if hasattr(field, "FT") else parent_type
effective_flags = int(field.Ff) if hasattr(field, "Ff") else parent_flags

if hasattr(field, "Kids"):
return [
y
for x in field.Kids
for y in _unnest_pdf_fields(
x,
copy(parent_name),
str(field.FT) if hasattr(field, "FT") else None,
int(field.Ff) if hasattr(field, "Ff") else None,
effective_type,
effective_flags,
)
]
else:

if not effective_type:
return []

# PDF fields have bit flags for specific options. The 17th bit (or hex
# 10000) on Buttons mark a "push button", w/o a permanent value
# (e.g. "Print this PDF"). They aren't really fields, just skip them.
if effective_type == "/Btn" and bool((effective_flags or 0) & 0x10000):
return []

if hasattr(field, "FT") or hasattr(field, "F"):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this affects the particular PDF you were trying to fix, but from what I understand there could be terminal fields where the field type is defined in the parent. The annotation flag ("F") can either be a separate Widget annotation (like you've addressed in this PR) or it can be merged with the terminal field.

So I think this should be:

Suggested change
if hasattr(field, "FT") or hasattr(field, "F"):
if effective_type or hasattr(field, "F"):

I'm still unclear if needs to be present or not. It most cases it is (to set the "Print" flag for the field to show up when the document is printed).

return [
{"type": effective_type, "var_name": ".".join(parent_name), "all": field}
]

return []


def has_fields(pdf_file: str) -> bool:
"""
Expand Down
12 changes: 9 additions & 3 deletions formfyxer/tests/test_lit_explorer_pdf_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ def test_parse_form_accepts_model_and_openai_base_url(self):
)
@patch("formfyxer.lit_explorer.unlock_pdf_in_place")
@patch("formfyxer.lit_explorer.pikepdf.open")
@patch("formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test")
@patch(
"formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test"
)
@patch(
"formfyxer.lit_explorer.rename_pdf_fields_with_context",
return_value={
Expand Down Expand Up @@ -131,7 +133,9 @@ def test_parse_form_rewrite_uses_rename_helper_for_nested_fields(
_mock_is_tagged,
_mock_needs_calculations,
):
fake_pdf = SimpleNamespace(pages=[object()], docinfo=SimpleNamespace(Title="Form"))
fake_pdf = SimpleNamespace(
pages=[object()], docinfo=SimpleNamespace(Title="Form")
)
mock_pikepdf_open.return_value = fake_pdf

with patch(
Expand Down Expand Up @@ -174,7 +178,9 @@ def test_rewrite_pdf_fields_in_place_preserves_order_and_full_flat_names(
]
fake_pdf = SimpleNamespace(
Root=SimpleNamespace(
AcroForm=SimpleNamespace(Fields=[object(), object(), object(), object()])
AcroForm=SimpleNamespace(
Fields=[object(), object(), object(), object()]
)
),
save=Mock(),
close=Mock(),
Expand Down
48 changes: 45 additions & 3 deletions formfyxer/tests/test_pdf_labeling_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,7 @@ def test_copy_pdf_fields_anchor_adjusts_rectangles(self, mock_get_transforms):
c.showPage()
c.save()

mock_get_transforms.return_value = [
None
]
mock_get_transforms.return_value = [None]

with pikepdf.Pdf.open(str(source_path)) as source_pdf, pikepdf.Pdf.open(
str(destination_path), allow_overwriting_input=True
Expand Down Expand Up @@ -308,6 +306,50 @@ def test_get_existing_pdf_fields_uses_overlap_avoidance_for_unmapped_fields(self
base_path.unlink(missing_ok=True)
patched_path.unlink(missing_ok=True)

def test_get_existing_pdf_fields_keeps_named_parent_widget_fields(self):
with NamedTemporaryFile(suffix=".pdf", delete=False) as base_tmp:
base_path = Path(base_tmp.name)
with NamedTemporaryFile(suffix=".pdf", delete=False) as patched_tmp:
patched_path = Path(patched_tmp.name)

try:
c = canvas.Canvas(str(base_path))
c.drawString(72, 720, "Page 1")
c.save()

with pikepdf.Pdf.open(str(base_path), allow_overwriting_input=True) as pdf:
page_obj = pdf.pages[0].obj
widget = pikepdf.Dictionary(
Type=pikepdf.Name("/Annot"),
Subtype=pikepdf.Name("/Widget"),
Rect=pikepdf.Array([72, 650, 212, 670]),
F=4,
P=page_obj,
)
widget_ref = pdf.make_indirect(widget)
parent = pikepdf.Dictionary(
FT=pikepdf.Name("/Tx"),
T=pikepdf.String("recipient_org"),
Kids=pikepdf.Array([widget_ref]),
)
parent_ref = pdf.make_indirect(parent)
widget.Parent = parent_ref
page_obj.Annots = pikepdf.Array([widget_ref])
pdf.Root.AcroForm = pikepdf.Dictionary(
Fields=pikepdf.Array([parent_ref])
)
pdf.save(str(patched_path))

loaded_fields = get_existing_pdf_fields(str(patched_path))
self.assertEqual(len(loaded_fields), 1)
self.assertEqual(len(loaded_fields[0]), 1)
self.assertEqual(loaded_fields[0][0].name, "recipient_org")
self.assertEqual(loaded_fields[0][0].x, 72)
self.assertEqual(loaded_fields[0][0].configs["width"], 140.0)
finally:
base_path.unlink(missing_ok=True)
patched_path.unlink(missing_ok=True)


if __name__ == "__main__":
unittest.main()
Loading