From 61876c50f7249dc9f7a3a594a8bc4aa10c50c5e7 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Wed, 1 Apr 2026 16:32:12 -0400 Subject: [PATCH] Fix issue where form fields can be skipped when flag is missing in parent object; pikepdf handles this fine but we were skipping valid form fields --- formfyxer/lit_explorer.py | 8 +++- formfyxer/pdf_wrangling.py | 36 ++++++++------ .../tests/test_lit_explorer_pdf_labeling.py | 12 +++-- formfyxer/tests/test_pdf_labeling_rules.py | 48 +++++++++++++++++-- 4 files changed, 81 insertions(+), 23 deletions(-) diff --git a/formfyxer/lit_explorer.py b/formfyxer/lit_explorer.py index a01c077..7869787 100644 --- a/formfyxer/lit_explorer.py +++ b/formfyxer/lit_explorer.py @@ -149,7 +149,9 @@ def _rewrite_pdf_fields_in_place( "PDF field traversal count did not match parsed field-name count" ) - for field_data, old_name, new_name in zip(flattened_fields, field_names, new_names): + for field_data, old_name, new_name in zip( + flattened_fields, field_names, new_names + ): cleaned_name = re.sub(r"^\*", "", new_name) if old_name == cleaned_name: continue @@ -159,7 +161,9 @@ def _rewrite_pdf_fields_in_place( continue # Nested fields keep their parent hierarchy; flat fields accept full dotted names. - target_name = cleaned_name.split(".")[-1] if "." in old_name else cleaned_name + target_name = ( + cleaned_name.split(".")[-1] if "." in old_name else cleaned_name + ) target.T = target_name my_pdf.save(in_file) diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py index 8fe0d97..12ec6d6 100644 --- a/formfyxer/pdf_wrangling.py +++ b/formfyxer/pdf_wrangling.py @@ -468,31 +468,37 @@ def _unnest_pdf_fields( parent_name = [] if hasattr(field, "T"): parent_name.append(str(field.T)) - if hasattr(field, "FT") and hasattr(field, "F"): - # PDF fields have bit flags for specific options. The 17th bit (or hex - # 10000) on Buttons mark a "push button", w/o a permanent value - # (e.g. "Print this PDF") They aren't really fields, just skip them. - if hasattr(field, "Ff") and field.FT == "/Btn" and bool(field.Ff & 0x10000): - return [] - return [{"type": field.FT, "var_name": ".".join(parent_name), "all": field}] - elif parent_type and parent_flags and hasattr(field, "F"): - if parent_flags and parent_type == "/Btn" and bool(parent_flags & 0x10000): - return [] - return [{"type": parent_type, "var_name": ".".join(parent_name), "all": field}] - elif hasattr(field, "Kids"): + effective_type = str(field.FT) if hasattr(field, "FT") else parent_type + effective_flags = int(field.Ff) if hasattr(field, "Ff") else parent_flags + + if hasattr(field, "Kids"): return [ y for x in field.Kids for y in _unnest_pdf_fields( x, copy(parent_name), - str(field.FT) if hasattr(field, "FT") else None, - int(field.Ff) if hasattr(field, "Ff") else None, + effective_type, + effective_flags, ) ] - else: + + if not effective_type: + return [] + + # PDF fields have bit flags for specific options. The 17th bit (or hex + # 10000) on Buttons mark a "push button", w/o a permanent value + # (e.g. "Print this PDF"). They aren't really fields, just skip them. + if effective_type == "/Btn" and bool((effective_flags or 0) & 0x10000): return [] + if hasattr(field, "FT") or hasattr(field, "F"): + return [ + {"type": effective_type, "var_name": ".".join(parent_name), "all": field} + ] + + return [] + def has_fields(pdf_file: str) -> bool: """ diff --git a/formfyxer/tests/test_lit_explorer_pdf_labeling.py b/formfyxer/tests/test_lit_explorer_pdf_labeling.py index f9e0fb4..db90987 100644 --- a/formfyxer/tests/test_lit_explorer_pdf_labeling.py +++ b/formfyxer/tests/test_lit_explorer_pdf_labeling.py @@ -99,7 +99,9 @@ def test_parse_form_accepts_model_and_openai_base_url(self): ) @patch("formfyxer.lit_explorer.unlock_pdf_in_place") @patch("formfyxer.lit_explorer.pikepdf.open") - @patch("formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test") + @patch( + "formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test" + ) @patch( "formfyxer.lit_explorer.rename_pdf_fields_with_context", return_value={ @@ -131,7 +133,9 @@ def test_parse_form_rewrite_uses_rename_helper_for_nested_fields( _mock_is_tagged, _mock_needs_calculations, ): - fake_pdf = SimpleNamespace(pages=[object()], docinfo=SimpleNamespace(Title="Form")) + fake_pdf = SimpleNamespace( + pages=[object()], docinfo=SimpleNamespace(Title="Form") + ) mock_pikepdf_open.return_value = fake_pdf with patch( @@ -174,7 +178,9 @@ def test_rewrite_pdf_fields_in_place_preserves_order_and_full_flat_names( ] fake_pdf = SimpleNamespace( Root=SimpleNamespace( - AcroForm=SimpleNamespace(Fields=[object(), object(), object(), object()]) + AcroForm=SimpleNamespace( + Fields=[object(), object(), object(), object()] + ) ), save=Mock(), close=Mock(), diff --git a/formfyxer/tests/test_pdf_labeling_rules.py b/formfyxer/tests/test_pdf_labeling_rules.py index d02612a..7e27f89 100644 --- a/formfyxer/tests/test_pdf_labeling_rules.py +++ b/formfyxer/tests/test_pdf_labeling_rules.py @@ -117,9 +117,7 @@ def test_copy_pdf_fields_anchor_adjusts_rectangles(self, mock_get_transforms): c.showPage() c.save() - mock_get_transforms.return_value = [ - None - ] + mock_get_transforms.return_value = [None] with pikepdf.Pdf.open(str(source_path)) as source_pdf, pikepdf.Pdf.open( str(destination_path), allow_overwriting_input=True @@ -308,6 +306,50 @@ def test_get_existing_pdf_fields_uses_overlap_avoidance_for_unmapped_fields(self base_path.unlink(missing_ok=True) patched_path.unlink(missing_ok=True) + def test_get_existing_pdf_fields_keeps_named_parent_widget_fields(self): + with NamedTemporaryFile(suffix=".pdf", delete=False) as base_tmp: + base_path = Path(base_tmp.name) + with NamedTemporaryFile(suffix=".pdf", delete=False) as patched_tmp: + patched_path = Path(patched_tmp.name) + + try: + c = canvas.Canvas(str(base_path)) + c.drawString(72, 720, "Page 1") + c.save() + + with pikepdf.Pdf.open(str(base_path), allow_overwriting_input=True) as pdf: + page_obj = pdf.pages[0].obj + widget = pikepdf.Dictionary( + Type=pikepdf.Name("/Annot"), + Subtype=pikepdf.Name("/Widget"), + Rect=pikepdf.Array([72, 650, 212, 670]), + F=4, + P=page_obj, + ) + widget_ref = pdf.make_indirect(widget) + parent = pikepdf.Dictionary( + FT=pikepdf.Name("/Tx"), + T=pikepdf.String("recipient_org"), + Kids=pikepdf.Array([widget_ref]), + ) + parent_ref = pdf.make_indirect(parent) + widget.Parent = parent_ref + page_obj.Annots = pikepdf.Array([widget_ref]) + pdf.Root.AcroForm = pikepdf.Dictionary( + Fields=pikepdf.Array([parent_ref]) + ) + pdf.save(str(patched_path)) + + loaded_fields = get_existing_pdf_fields(str(patched_path)) + self.assertEqual(len(loaded_fields), 1) + self.assertEqual(len(loaded_fields[0]), 1) + self.assertEqual(loaded_fields[0][0].name, "recipient_org") + self.assertEqual(loaded_fields[0][0].x, 72) + self.assertEqual(loaded_fields[0][0].configs["width"], 140.0) + finally: + base_path.unlink(missing_ok=True) + patched_path.unlink(missing_ok=True) + if __name__ == "__main__": unittest.main()