diff --git a/object.go b/object.go
index 6b3d61b..b65acc2 100644
--- a/object.go
+++ b/object.go
@@ -318,3 +318,13 @@ type DocInfo struct {
 	ModDate time.Time
 	Custom  map[string]string
 }
+
+// EmbeddedFile describes a single PDF attachment: one (name, file
+// specification) pair taken from the catalog's EmbeddedFiles name tree.
+type EmbeddedFile struct {
+	// Name is the string key the entry is stored under in the name tree.
+	Name string
+	// Spec is the resolved /Filespec dictionary; the attachment bytes are held
+	// by its /EF stream.
+	Spec *Dict
+}
diff --git a/reader.go b/reader.go
index 787b0ea..4e799f4 100644
--- a/reader.go
+++ b/reader.go
@@ -464,3 +464,85 @@ func (r *Reader) DecodeStream(obj Object) ([]byte, error) {
 	}
 	return s.Content()
 }
+
+// maxNameTreeDepth caps how many levels of /Kids walkNameTree will descend.
+const maxNameTreeDepth = 1000
+
+// EmbeddedFiles lists the document's attachments by walking the EmbeddedFiles
+// name tree under the catalog's /Names dictionary, in the order the walk
+// meets them. The result is nil when the catalog cannot be read, when the
+// catalog offers no /Names or /EmbeddedFiles dictionary, or when the walk
+// collects no entries.
+func (r *Reader) EmbeddedFiles() []EmbeddedFile {
+	catalog, err := r.Catalog()
+	if err != nil {
+		return nil
+	}
+	nameDict, found := catalog.Dict("Names")
+	if !found {
+		return nil
+	}
+	treeRoot, found := nameDict.Dict("EmbeddedFiles")
+	if !found {
+		return nil
+	}
+	var files []EmbeddedFile
+	visited := make(map[Reference]struct{})
+	r.walkNameTree(treeRoot, visited, 0, &files)
+	return files
+}
+
+// walkNameTree appends to out every (name, /Filespec) pair reachable from
+// node, visiting a node's /Kids before its own /Names.
+//
+// Two guards keep a cyclic or very deep tree from hanging the walk or
+// overflowing the stack: seen holds the /Kids references already followed, so
+// each is descended at most once, and nodes deeper than maxNameTreeDepth are
+// not visited. Kids and values for which ResolveDict returns an error, and
+// keys that are not strings, are skipped without error.
+//
+// NOTE(review): the root arrives already resolved, so its own reference is
+// not in seen at the start; a /Kids cycle leading back to the root walks the
+// root once more.
+func (r *Reader) walkNameTree(node *Dict, seen map[Reference]struct{}, depth int, out *[]EmbeddedFile) {
+	if node == nil || depth > maxNameTreeDepth {
+		return
+	}
+
+	// Descend into the children first.
+	if children, ok := node.Array("Kids"); ok {
+		for _, child := range children {
+			// Only kids given as references can be deduplicated; any other
+			// kid is bounded by depth alone.
+			if ref, indirect := child.(Reference); indirect {
+				if _, visited := seen[ref]; visited {
+					continue
+				}
+				seen[ref] = struct{}{}
+			}
+			sub, err := r.ResolveDict(child)
+			if err != nil {
+				continue
+			}
+			r.walkNameTree(sub, seen, depth+1, out)
+		}
+	}
+
+	// /Names is read as a flat run of key, value pairs; a trailing item
+	// without a partner is ignored.
+	pairs, ok := node.Array("Names")
+	if !ok {
+		return
+	}
+	for k := 1; k < len(pairs); k += 2 {
+		key, isString := pairs[k-1].(String)
+		if !isString {
+			continue
+		}
+		spec, err := r.ResolveDict(pairs[k])
+		if err != nil {
+			continue
+		}
+		*out = append(*out, EmbeddedFile{Name: string(key), Spec: spec})
+	}
+}
diff --git a/reader_test.go b/reader_test.go
index e556248..b330de5 100644
--- a/reader_test.go
+++ b/reader_test.go
@@ -247,3 +247,119 @@ func TestParseDate(t *testing.T) {
 		t.Fatalf("hour %d", d.Hour())
 	}
 }
+
+// buildDictPDF assembles a minimal PDF with a classical xref table in which
+// objs[i] becomes object i+1, generation 0. The trailer names object 1 as
+// /Root, so objs[0] has to be the catalog. Bodies are written out verbatim,
+// so they must be plain objects rather than streams.
+func buildDictPDF(t *testing.T, objs []string) []byte {
+	t.Helper()
+	var pdf bytes.Buffer
+	pdf.WriteString("%PDF-1.7\n%\xE2\xE3\xCF\xD3\n")
+	size := len(objs) + 1 // one extra slot for the free entry 0
+	pos := make([]int, size)
+	for i, body := range objs {
+		num := i + 1
+		pos[num] = pdf.Len()
+		fmt.Fprintf(&pdf, "%d 0 obj\n%s\nendobj\n", num, body)
+	}
+	xrefStart := pdf.Len()
+	fmt.Fprintf(&pdf, "xref\n0 %d\n%010d %05d f \n", size, 0, 65535)
+	for _, off := range pos[1:] {
+		fmt.Fprintf(&pdf, "%010d %05d n \n", off, 0)
+	}
+	fmt.Fprintf(&pdf, "trailer\n<< /Size %d /Root 1 0 R >>\nstartxref\n%d\n%%%%EOF\n",
+		size, xrefStart)
+	return pdf.Bytes()
+}
+
+// TestEmbeddedFiles reads two attachments from a single-node name tree and
+// checks their count, order, names and the first entry's /F value.
+func TestEmbeddedFiles(t *testing.T) {
+	pdf := buildDictPDF(t, []string{
+		"<< /Type /Catalog /Pages 2 0 R /Names << /EmbeddedFiles 3 0 R >> >>",
+		"<< /Type /Pages /Kids [] /Count 0 >>",
+		"<< /Names [ (a.xml) 4 0 R (b.xml) 5 0 R ] >>",
+		"<< /Type /Filespec /F (a.xml) >>",
+		"<< /Type /Filespec /F (b.xml) >>",
+	})
+	r, err := Open(bytes.NewReader(pdf))
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer r.Close()
+
+	files := r.EmbeddedFiles()
+	if n := len(files); n != 2 {
+		t.Fatalf("got %d files, want 2", n)
+	}
+	first, second := files[0], files[1]
+	if first.Name != "a.xml" || second.Name != "b.xml" {
+		t.Fatalf("names %q, %q", first.Name, second.Name)
+	}
+	f, ok := first.Spec.String("F")
+	if !ok || f != "a.xml" {
+		t.Fatalf("spec /F %q ok=%v", f, ok)
+	}
+}
+
+// TestEmbeddedFilesNestedKids checks that an entry held by a child node,
+// reached through the root's /Kids, is returned.
+func TestEmbeddedFilesNestedKids(t *testing.T) {
+	pdf := buildDictPDF(t, []string{
+		"<< /Type /Catalog /Pages 2 0 R /Names << /EmbeddedFiles 3 0 R >> >>",
+		"<< /Type /Pages /Kids [] /Count 0 >>",
+		"<< /Kids [ 4 0 R ] >>",
+		"<< /Names [ (a.xml) 5 0 R ] >>",
+		"<< /Type /Filespec /F (a.xml) >>",
+	})
+	r, err := Open(bytes.NewReader(pdf))
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer r.Close()
+
+	files := r.EmbeddedFiles()
+	if len(files) != 1 || files[0].Name != "a.xml" {
+		t.Fatalf("got %+v, want one a.xml", files)
+	}
+}
+
+// TestEmbeddedFilesCyclicKidsTerminates feeds the walk a root (obj 3) whose
+// /Kids points back at itself; the call must return, with no entries, rather
+// than recurse without end.
+func TestEmbeddedFilesCyclicKidsTerminates(t *testing.T) {
+	pdf := buildDictPDF(t, []string{
+		"<< /Type /Catalog /Pages 2 0 R /Names << /EmbeddedFiles 3 0 R >> >>",
+		"<< /Type /Pages /Kids [] /Count 0 >>",
+		"<< /Kids [ 3 0 R ] >>",
+	})
+	r, err := Open(bytes.NewReader(pdf))
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer r.Close()
+
+	if n := len(r.EmbeddedFiles()); n != 0 {
+		t.Fatalf("got %d files, want 0", n)
+	}
+}
+
+// TestEmbeddedFilesNone checks that a catalog without /Names yields a nil
+// slice.
+func TestEmbeddedFilesNone(t *testing.T) {
+	pdf := buildDictPDF(t, []string{
+		"<< /Type /Catalog /Pages 2 0 R >>",
+		"<< /Type /Pages /Kids [] /Count 0 >>",
+	})
+	r, err := Open(bytes.NewReader(pdf))
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer r.Close()
+
+	files := r.EmbeddedFiles()
+	if files != nil {
+		t.Fatalf("got %+v, want nil", files)
+	}
+}