Skip to content

benedoc-inc/pdfer

Repository files navigation

pdfer

Pure Go PDF processing library — zero CGO, zero external dependencies.

Go Reference Go Report Card

Installation

go get github.com/benedoc-inc/pdfer

Quick start

import "github.com/benedoc-inc/pdfer"

// Merge two PDFs
out, err := pdfer.MergePDFs([][]byte{a, b}, nil, false)

// Split into page ranges
parts, err := pdfer.SplitPDF(pdfBytes, []pdfer.PageRange{{1, 3}, {4, 6}}, nil, false)

// Fill a form and flatten it
form, err := pdfer.ExtractForm(pdfBytes, nil, false)
filled, err := form.Fill(pdfBytes, pdfer.FormData{"name": "Alice"}, nil, false)
flat, err := pdfer.FlattenForm(filled, nil, false)

API reference

All operations are available from the root pdfer package. Import sub-packages only for lower-level control.

Encryption

out, err := pdfer.EncryptPDF(pdfBytes, []byte("user-pw"), []byte("owner-pw"), false)
out, err := pdfer.DecryptPDF(pdfBytes, []byte("password"), false)
perms, err := pdfer.GetPermissions(pdfBytes, []byte("password"))
// perms.Print, perms.Modify, perms.Copy, perms.AddAnnotations, …

Page operations

// Extract, delete, reorder
out, err := pdfer.ExtractPages(pdfBytes, []int{1, 3, 5}, nil, false)
out, err := pdfer.DeletePage(pdfBytes, 2, nil, false)
out, err := pdfer.DeletePages(pdfBytes, []int{2, 4}, nil, false)
out, err := pdfer.ReorderPages(pdfBytes, []int{3, 1, 2}, nil, false)

// Insert and duplicate
out, err := pdfer.InsertBlankPage(pdfBytes, 2, 612, 792, nil, false) // position, width, height (pts)
out, err := pdfer.DuplicatePage(pdfBytes, 1, 2, nil, false)          // page, copies

// Geometry
out, err := pdfer.RotatePage(pdfBytes, 1, 90, nil, false)   // angle: 90, 180, or 270
out, err := pdfer.RotateAllPages(pdfBytes, 180, nil, false)
out, err := pdfer.CropPage(pdfBytes, 1, [4]float64{36, 36, 576, 756}, nil, false) // [llx lly urx ury]
out, err := pdfer.SetPageSize(pdfBytes, 1, 612, 792, nil, false)                  // width, height (pts)

Document operations

out, err := pdfer.MergePDFs([][]byte{a, b, c}, nil, false)
parts, err := pdfer.SplitPDF(pdfBytes, []pdfer.PageRange{{1, 3}, {4, 6}}, nil, false)
parts, err := pdfer.SplitPDFByPageCount(pdfBytes, 10, nil, false)
out, err := pdfer.Redact(pdfBytes, []pdfer.RedactBox{{Page: 1, Rect: [4]float64{50, 680, 200, 720}}}, nil)
out, err := pdfer.Repair(pdfBytes, nil)
out, err := pdfer.Linearize(pdfBytes, nil) // Fast Web View

Stamping

out, err := pdfer.StampText(pdfBytes, 1, pdfer.TextStamp{
    Text: "CONFIDENTIAL", FontSize: 14, X: 72, Y: 720, R: 1,
}, nil, false)
out, err := pdfer.StampAllPages(pdfBytes, pdfer.TextStamp{Text: "DRAFT", X: 72, Y: 36}, nil, false)
out, err := pdfer.StampPageNumbers(pdfBytes, pdfer.PageNumberOptions{
    Position: pdfer.BottomCenter, FontSize: 10,
}, nil, false)

Metadata

meta, err := pdfer.GetMetadata(pdfBytes, nil, false)
// meta.Title, meta.Author, meta.CreationDate, meta.PageCount, …

out, err := pdfer.SetMetadata(pdfBytes, pdfer.MetadataUpdate{
    Title:  "Annual Report",
    Author: "Alice",
}, nil, false)

out, err := pdfer.RedactMetadata(pdfBytes, nil, false) // strips /Info and XMP

Annotations

// Link to URL
out, err := pdfer.AddAnnotation(pdfBytes, 1, pdfer.AnnotationConfig{
    Type: pdfer.AnnotLink,
    Rect: [4]float64{72, 700, 200, 720},
    URI:  "https://example.com",
}, nil, false)

// Internal page link
out, err := pdfer.AddAnnotation(pdfBytes, 1, pdfer.AnnotationConfig{
    Type:     pdfer.AnnotLink,
    Rect:     [4]float64{72, 680, 200, 700},
    DestPage: 3,
}, nil, false)

// Text note, highlight, free-text, underline, strikeout also supported
out, err := pdfer.AddAnnotation(pdfBytes, 1, pdfer.AnnotationConfig{
    Type:     pdfer.AnnotHighlight,
    Rect:     [4]float64{72, 650, 300, 665},
    Contents: "Important passage",
    Color:    [3]float64{1, 1, 0}, // yellow
}, nil, false)

Bookmarks

bmarks, err := pdfer.GetBookmarks(pdfBytes, nil, false)

out, err := pdfer.SetBookmarks(pdfBytes, []pdfer.BookmarkEntry{
    {Title: "Introduction", Page: 1},
    {Title: "Chapter 1", Page: 3, Children: []pdfer.BookmarkEntry{
        {Title: "Background", Page: 3},
        {Title: "Methods",    Page: 7},
    }},
    {Title: "Appendix", Page: 42},
}, nil, false)

Digital signatures

// Sign
out, err := pdfer.SignPDF(pdfBytes, pdfer.SignOptions{
    Certificate: cert,   // *x509.Certificate
    PrivateKey:  key,    // crypto.Signer
    Reason:      "Approved",
    Location:    "New York",
})

// Validate
sigs, err := pdfer.ValidateSignatures(pdfBytes)
for _, s := range sigs {
    fmt.Printf("%s: valid=%v signer=%s\n", s.FieldName, s.Valid, s.SignerName)
}

Forms (AcroForm and XFA)

// Auto-detect form type
kind, err := pdfer.DetectForm(pdfBytes, nil, false) // "acroform", "xfa", or "unknown"

// Extract and fill
form, err := pdfer.ExtractForm(pdfBytes, nil, false)
schema := form.Schema()
filled, err := form.Fill(pdfBytes, pdfer.FormData{"FirstName": "Alice"}, nil, false)

// Flatten (make non-interactive)
out, err := pdfer.FlattenForm(filled, nil, false)

Content extraction

// Full structured extraction
doc, err := pdfer.ExtractContent(pdfBytes, nil, false)
// doc.Pages[0].Text, doc.Pages[0].Images, doc.Pages[0].Annotations, doc.Bookmarks, …

json, err := pdfer.ExtractContentToJSON(pdfBytes, nil, false)

// Images only
imgs, err := pdfer.ExtractAllImages(pdfBytes, nil, false)
// imgs[0].Data (raw bytes), imgs[0].Width, imgs[0].Height, imgs[0].Format

// Dump everything to disk
out, err := pdfer.ExtractToDirectory(pdfBytes, nil, "/tmp/extracted", false)

Comparison

result, err := pdfer.ComparePDFs(pdf1, pdf2, nil, nil, false)
fmt.Println(pdfer.CompareReport(result))

// With options
opts := pdfer.DefaultCompareOptions()
opts.IgnoreDates = true
result, err := pdfer.ComparePDFsWithOptions(pdf1, pdf2, nil, nil, opts)

Image replacement

// Replace an image by resource name or object number
out, err := pdfer.ReplaceImage(pdfBytes, "Im1", jpegBytes, "jpeg", nil, false)
out, err := pdfer.ReplaceImage(pdfBytes, "Im1", pngBytes,  "png",  nil, false)

PDF/A conversion and validation

// Convert to PDF/A (decrypts first if needed)
out, err := pdfer.ConvertToPDFA(pdfBytes, nil, "1b") // "1b", "2b", or "3b"

// Validate conformance
vr := pdfer.ValidatePDFA(pdfBytes)
if !vr.Conformant {
    for _, v := range vr.Violations {
        fmt.Println(v.Code, v.Message)
    }
}

Creating PDFs from scratch

import "github.com/benedoc-inc/pdfer/core/write"

builder := write.NewSimplePDFBuilder()
page := builder.AddPage(write.PageSizeLetter)

font := page.AddStandardFont("Helvetica")
page.Content().
    BeginText().
    SetFont(font, 24).
    SetTextPosition(72, 720).
    ShowText("Hello, World!").
    EndText().
    SetFillColorRGB(0.9, 0.2, 0.2).
    Rectangle(72, 660, 200, 40).
    Fill()

builder.FinalizePage(page)
pdfBytes, err := builder.Bytes()

Parsing PDFs directly

import "github.com/benedoc-inc/pdfer/core/parse"

pdf, err := parse.OpenWithOptions(pdfBytes, parse.ParseOptions{
    Password: []byte("secret"),
    Verbose:  false,
})

fmt.Println(pdf.Version(), pdf.ObjectCount(), pdf.IsEncrypted())
obj, err := pdf.GetObject(5)

Package layout

pdfer/
├── pdfer.go / api.go   — root package (start here)
├── core/
│   ├── parse/          — PDF structure parsing
│   ├── write/          — PDF generation and PDF/A validation
│   ├── encrypt/        — RC4/AES encryption primitives
│   ├── manipulate/     — all document-level operations
│   ├── sign/           — digital signature creation and validation
│   └── compare/        — structured PDF diffing
├── forms/
│   ├── acroform/       — AcroForm parsing, filling, flattening
│   └── xfa/            — XFA stream extraction and dataset updating
├── content/extract/    — text, image, annotation, bookmark extraction
├── resources/font/     — TrueType/OpenType font embedding
└── types/              — shared data structures

Feature matrix

Category Feature Status
Encryption RC4 40/128-bit, AES 128/256-bit read
AES-128 write
Owner-password auth (R≤4)
Permission flags
Page ops Merge, split, extract, delete
Reorder, insert blank, duplicate
Rotate, crop, resize
Content Stamp text / page numbers
Redact content streams, annotations, image XObjects
Redact XMP/Info metadata — not covered by Redact; call RedactMetadata separately
Linearize (Fast Web View)
Repair / rebuild
Metadata Read /Info + XMP
Write /Info
Strip metadata (privacy)
Annotations Link (URI + internal), Text, FreeText
Highlight, Underline, StrikeOut
Bookmarks Read and write outline tree
Signatures PKCS#7 / CMS detached signing
Signature validation (RSA + ECDSA)
Visible signature field appearance — not supported (signatures are always invisible)
RFC 3161 timestamp (TSA) — not supported
Long-term validation (LTV / OCSP / CRL) — not supported
Forms AcroForm parse, fill, flatten
XFA extract, fill, rebuild
Extraction Text, graphics, images, fonts
Annotations, bookmarks, metadata
Table detection from graphic grid lines
JSON serialization, directory dump
Text search / find-and-highlight
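JPEG2000 (JPXDecode) decode — not supported (streams are detected but not decoded)
JBIG2 decode — not supported (streams are detected but not decoded)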
Comparison Structural + text + image diff
PDF/A Conformance validation (heuristic, parts 1–3)
Conversion of arbitrary PDFs
Images Replace image XObject (JPEG/PNG/raw)
Parsing xref tables + streams (PDF 1.5+)
Object streams + Type-2 xref entries
Incremental updates

Known limitations

See GAPS.md for the full history and detailed file pointers.

Redaction

  • Redact clears content streams, annotation objects, and image XObjects within the specified boxes. XMP metadata and /Info entries are not cleared — call RedactMetadata separately for document-level metadata.

Digital signatures

  • Signatures are always invisible (no rendered appearance box).
  • No RFC 3161 timestamps — signatures become unverifiable after the signing certificate expires.
  • No long-term validation (no embedded OCSP responses or CRL data).

Forms

  • Form.Validate() returns "not implemented" for XFA forms — structural extraction only.
  • Calculated form fields are not re-evaluated on Fill(); dependent fields keep their stale values until the document is opened in a viewer that recalculates them.
  • XFA script parsing handles common patterns (visibility, set-value, validate, calculate) and falls back to ActionTypeExecute for scripts it cannot classify.

Images / encoding

  • JPEG2000 (JPXDecode) and JBIG2 image streams are detected but not decoded.
  • CMYK images are returned with raw CMYK bytes; callers must convert to RGB.

Other

  • Linearize does not emit a /H hint stream — object ordering is correct but byte-serving is not optimised.
  • StampText emits a single Tj operator; text is not wrapped across lines.
  • Optional Content Groups (PDF layers) are not accessible via the API.
  • Named destinations and embedded file attachments are not exposed.
  • PDF/A validation is heuristic — it misses font subset tags, transparency groups, overprint settings, and annotation appearance requirements.

Testing

go test ./...

License

MIT — see LICENSE for details.

About

A pure Go library for PDF processing with comprehensive XFA (XML Forms Architecture) support

Topics

Resources

License

Contributing

Stars

Watchers

Forks

Packages

 
 
 

Contributors