From 9ae9960fe6c769317b060a717d62eb5f19ef67f5 Mon Sep 17 00:00:00 2001
From: Alan Zimmerman <alan.zimm@gmail.com>
Date: Wed, 7 Jun 2023 23:05:07 +0100
Subject: [PATCH 1/3] Brute force process/ignore of inline images.

Inline images occur in a content stream as:

BI
  params
ID
  image content
EI

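So when we see the ID operator, skip forward until we see the next "EI"
in the stream. This is a brute-force approach: the skip stops at the
first "EI", even if those bytes happen to be part of the image content.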

This is the same approach the Rust pdf library takes:
https://github.com/pdf-rs/pdf/blob/677152fa8e84a2dcfbc3d927535148bb8d0369ba/pdf/src/content.rs#L127
---
 .gitignore                        | 2 ++
 cabal.project                     | 2 +-
 content/lib/Pdf/Content/Parser.hs | 8 ++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 99ad5fb..f5146a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ tmp/
 cabal.sandbox.config
 dist-newstyle/
 .ghc.environment.*
+/.envrc
+/cabal.project.local
diff --git a/cabal.project b/cabal.project
index cc0b481..eecd476 100644
--- a/cabal.project
+++ b/cabal.project
@@ -2,5 +2,5 @@ packages:
   core/
   content/
   document/
-  viewer/
+  -- viewer/
   examples/
diff --git a/content/lib/Pdf/Content/Parser.hs b/content/lib/Pdf/Content/Parser.hs
index f386491..c6403b8 100644
--- a/content/lib/Pdf/Content/Parser.hs
+++ b/content/lib/Pdf/Content/Parser.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 
 -- | Parse content stream
 
@@ -43,8 +44,15 @@ parseContent = do
   skipSpace
   (Parser.endOfInput >> return Nothing) <|>
     fmap Just (fmap Obj parseObject <|>
+              parseInlineImage <|>
                fmap (Op . toOp) (Parser.takeWhile1 isRegularChar))
 
+parseInlineImage :: Parser Expr
+parseInlineImage = do
+  Parser.string "ID"
+  Parser.manyTill Parser.anyChar (Parser.string "EI")
+  return $ Op Op_EI
+
 -- Treat comments as spaces
 skipSpace :: Parser ()
 skipSpace = do

From e1b0d35e54b2bebfae1b86d99072bff581baa0b9 Mon Sep 17 00:00:00 2001
From: Alan Zimmerman <alan.zimm@gmail.com>
Date: Wed, 7 Jun 2023 23:09:12 +0100
Subject: [PATCH 2/3] Return all the operations for a page.

This allows simple processing of things like generated bank
statements.
---
 content/lib/Pdf/Content/Processor.hs |  8 +++--
 document/lib/Pdf/Document/Page.hs    | 51 +++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/content/lib/Pdf/Content/Processor.hs b/content/lib/Pdf/Content/Processor.hs
index 76b433c..a73f484 100644
--- a/content/lib/Pdf/Content/Processor.hs
+++ b/content/lib/Pdf/Content/Processor.hs
@@ -89,15 +89,16 @@ initialGraphicsState = GraphicsState {
 data Span = Span
   { spGlyphs :: [Glyph]
   , spFontName :: Name
-  }
+  } deriving Show
 
 -- | Processor maintains graphics state
 data Processor = Processor {
   prState :: GraphicsState,
   prStateStack :: [GraphicsState],
   prGlyphDecoder :: GlyphDecoder,
-  prSpans :: [Span]
+  prSpans :: [Span],
   -- ^ Each element is a list of glyphs, drawn in one shot
+  prOperators :: [Operator]
   }
 
 -- | Create processor in initial state
@@ -106,7 +107,8 @@ mkProcessor = Processor {
   prState = initialGraphicsState,
   prStateStack = [],
   prGlyphDecoder = \_ _ -> [],
-  prSpans = mempty
+  prSpans = mempty,
+  prOperators = mempty
   }
 
 -- | Process one operation
diff --git a/document/lib/Pdf/Document/Page.hs b/document/lib/Pdf/Document/Page.hs
index 01bc39b..5c2abc8 100644
--- a/document/lib/Pdf/Document/Page.hs
+++ b/document/lib/Pdf/Document/Page.hs
@@ -11,7 +11,8 @@ module Pdf.Document.Page
   pageFontDicts,
   pageExtractText,
   pageExtractGlyphs,
-  glyphsToText
+  glyphsToText,
+  pageExtractOperators
 )
 where
 
@@ -249,6 +250,54 @@ pageExtractGlyphs page = do
     }
   return (List.reverse (prSpans p))
 
+pageExtractOperators :: Page -> IO [Operator]
+pageExtractOperators page = do
+  fontDicts <- Map.fromList <$> pageFontDicts page
+  glyphDecoders <- Traversable.forM fontDicts $ \fontDict ->
+    fontInfoDecodeGlyphs <$> fontDictLoadInfo fontDict
+  let glyphDecoder fontName = \str ->
+        case Map.lookup fontName glyphDecoders of
+          Nothing -> []
+          Just decode -> decode str
+
+  xobjects <- pageXObjects page
+
+  is <- do
+    contents <- pageContents page
+    let Page pdf _ _ = page
+    is <- combinedContent pdf contents
+    Streams.parserToInputStream parseContent is
+
+  -- use content stream processor to collect the operators, in order
+  let loop xobjs s p = do
+        next <- readNextOperator s
+        case next of
+          Just (Op_Do, [Name name]) -> processDo xobjs name p >>= loop xobjs s
+          Just op -> do
+            let p' = p { prOperators = op : prOperators p }
+            case processOp op p' of
+              Left err -> throwIO (Unexpected err [])
+              Right  p' -> loop xobjs s p'
+          Nothing -> return p
+
+      processDo xobjs name p = do
+        case Map.lookup name xobjs of
+          Nothing -> return p
+          Just xobj -> do
+            s <- do
+              s <- Streams.fromLazyByteString (xobjectContent xobj)
+              Streams.parserToInputStream parseContent s
+
+            let gdec' = prGlyphDecoder p
+            p' <- loop (xobjectChildren xobj) s
+              (p {prGlyphDecoder = xobjectGlyphDecoder xobj})
+            return (p' {prGlyphDecoder = gdec'})
+
+  p <- loop xobjects is $ mkProcessor {
+    prGlyphDecoder = glyphDecoder
+    }
+  return (List.reverse (prOperators p))
+
 combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
 combinedContent pdf refs = do
   allStreams <- forM refs $ \ref -> do

From 17d2e3e1d2f697e0df21982f70b842811d02fee3 Mon Sep 17 00:00:00 2001
From: Alan Zimmerman <alan.zimm@gmail.com>
Date: Tue, 13 Jun 2023 23:07:14 +0100
Subject: [PATCH 3/3] Export Pdf.Content.Ops.Object(..)

---
 content/lib/Pdf/Content/Ops.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/content/lib/Pdf/Content/Ops.hs b/content/lib/Pdf/Content/Ops.hs
index e208a65..46b0cc9 100644
--- a/content/lib/Pdf/Content/Ops.hs
+++ b/content/lib/Pdf/Content/Ops.hs
@@ -7,6 +7,7 @@ module Pdf.Content.Ops
   Op(..),
   Expr(..),
   Operator,
+  Object(..),
   toOp
 )
 where