From 9ae9960fe6c769317b060a717d62eb5f19ef67f5 Mon Sep 17 00:00:00 2001 From: Alan Zimmerman Date: Wed, 7 Jun 2023 23:05:07 +0100 Subject: [PATCH 1/3] Brute force process/ignore of inline images. These occur as BI params ID image content EI So when we see the ID, skip forward until we see an "EI" in the text. This is the way it is done in the rust pdf library https://github.com/pdf-rs/pdf/blob/677152fa8e84a2dcfbc3d927535148bb8d0369ba/pdf/src/content.rs#L127 --- .gitignore | 2 ++ cabal.project | 2 +- content/lib/Pdf/Content/Parser.hs | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 99ad5fb..f5146a6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ tmp/ cabal.sandbox.config dist-newstyle/ .ghc.environment.* +/.envrc +/cabal.project.local diff --git a/cabal.project b/cabal.project index cc0b481..eecd476 100644 --- a/cabal.project +++ b/cabal.project @@ -2,5 +2,5 @@ packages: core/ content/ document/ - viewer/ + -- viewer/ examples/ diff --git a/content/lib/Pdf/Content/Parser.hs b/content/lib/Pdf/Content/Parser.hs index f386491..c6403b8 100644 --- a/content/lib/Pdf/Content/Parser.hs +++ b/content/lib/Pdf/Content/Parser.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} -- | Parse content stream @@ -43,8 +44,15 @@ parseContent = do skipSpace (Parser.endOfInput >> return Nothing) <|> fmap Just (fmap Obj parseObject <|> + parseInlineImage <|> fmap (Op . toOp) (Parser.takeWhile1 isRegularChar)) +parseInlineImage :: Parser Expr +parseInlineImage = do + Parser.string "ID" + Parser.manyTill Parser.anyChar (Parser.string "EI") + return $ Op Op_EI + -- Treat comments as spaces skipSpace :: Parser () skipSpace = do From e1b0d35e54b2bebfae1b86d99072bff581baa0b9 Mon Sep 17 00:00:00 2001 From: Alan Zimmerman Date: Wed, 7 Jun 2023 23:09:12 +0100 Subject: [PATCH 2/3] Return all the operations for a page. This allows simple processing of things like generated bank statements. --- content/lib/Pdf/Content/Processor.hs | 8 +++-- document/lib/Pdf/Document/Page.hs | 51 +++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/content/lib/Pdf/Content/Processor.hs b/content/lib/Pdf/Content/Processor.hs index 76b433c..a73f484 100644 --- a/content/lib/Pdf/Content/Processor.hs +++ b/content/lib/Pdf/Content/Processor.hs @@ -89,15 +89,16 @@ initialGraphicsState = GraphicsState { data Span = Span { spGlyphs :: [Glyph] , spFontName :: Name - } + } deriving Show -- | Processor maintains graphics state data Processor = Processor { prState :: GraphicsState, prStateStack :: [GraphicsState], prGlyphDecoder :: GlyphDecoder, - prSpans :: [Span] + prSpans :: [Span], -- ^ Each element is a list of glyphs, drawn in one shot + prOperators :: [Operator] } -- | Create processor in initial state @@ -106,7 +107,8 @@ mkProcessor = Processor { prState = initialGraphicsState, prStateStack = [], prGlyphDecoder = \_ _ -> [], - prSpans = mempty + prSpans = mempty, + prOperators = mempty } -- | Process one operation diff --git a/document/lib/Pdf/Document/Page.hs b/document/lib/Pdf/Document/Page.hs index 01bc39b..5c2abc8 100644 --- a/document/lib/Pdf/Document/Page.hs +++ b/document/lib/Pdf/Document/Page.hs @@ -11,7 +11,8 @@ module Pdf.Document.Page pageFontDicts, pageExtractText, pageExtractGlyphs, - glyphsToText + glyphsToText, + pageExtractOperators ) where @@ -249,6 +250,54 @@ pageExtractGlyphs page = do } return (List.reverse (prSpans p)) +pageExtractOperators :: Page -> IO [Operator] +pageExtractOperators page = do + fontDicts <- Map.fromList <$> pageFontDicts page + glyphDecoders <- Traversable.forM fontDicts $ \fontDict -> + fontInfoDecodeGlyphs <$> fontDictLoadInfo fontDict + let glyphDecoder fontName = \str -> + case Map.lookup fontName glyphDecoders of + Nothing -> [] + Just decode -> decode str + + xobjects <- pageXObjects page + + is <- do + contents <- pageContents page + let Page pdf _ _ = page + is <- combinedContent pdf contents + Streams.parserToInputStream parseContent is + + -- use content stream processor to extract text + let loop xobjs s p = do + next <- readNextOperator s + case next of + Just (Op_Do, [Name name]) -> processDo xobjs name p >>= loop xobjs s + Just op -> do + let p' = p { prOperators = op : prOperators p } + case processOp op p' of + Left err -> throwIO (Unexpected err []) + Right p' -> loop xobjs s p' + Nothing -> return p + + processDo xobjs name p = do + case Map.lookup name xobjs of + Nothing -> return p + Just xobj -> do + s <- do + s <- Streams.fromLazyByteString (xobjectContent xobj) + Streams.parserToInputStream parseContent s + + let gdec' = prGlyphDecoder p + p' <- loop (xobjectChildren xobj) s + (p {prGlyphDecoder = xobjectGlyphDecoder xobj}) + return (p' {prGlyphDecoder = gdec'}) + + p <- loop xobjects is $ mkProcessor { + prGlyphDecoder = glyphDecoder + } + return (List.reverse (prOperators p)) + combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString) combinedContent pdf refs = do allStreams <- forM refs $ \ref -> do From 17d2e3e1d2f697e0df21982f70b842811d02fee3 Mon Sep 17 00:00:00 2001 From: Alan Zimmerman Date: Tue, 13 Jun 2023 23:07:14 +0100 Subject: [PATCH 3/3] Export Pdf.Content.Ops.Object(..) --- content/lib/Pdf/Content/Ops.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/content/lib/Pdf/Content/Ops.hs b/content/lib/Pdf/Content/Ops.hs index e208a65..46b0cc9 100644 --- a/content/lib/Pdf/Content/Ops.hs +++ b/content/lib/Pdf/Content/Ops.hs @@ -7,6 +7,7 @@ module Pdf.Content.Ops Op(..), Expr(..), Operator, + Object(..), toOp ) where