diff --git a/.gitignore b/.gitignore
index 99ad5fb..f5146a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ tmp/
 cabal.sandbox.config
 dist-newstyle/
 .ghc.environment.*
+/.envrc
+/cabal.project.local
diff --git a/cabal.project b/cabal.project
index cc0b481..eecd476 100644
--- a/cabal.project
+++ b/cabal.project
@@ -2,5 +2,5 @@ packages:
   core/
   content/
   document/
-  viewer/
+  -- viewer/
   examples/
diff --git a/content/lib/Pdf/Content/Ops.hs b/content/lib/Pdf/Content/Ops.hs
index e208a65..46b0cc9 100644
--- a/content/lib/Pdf/Content/Ops.hs
+++ b/content/lib/Pdf/Content/Ops.hs
@@ -7,6 +7,7 @@ module Pdf.Content.Ops
   Op(..),
   Expr(..),
   Operator,
+  Object(..),
   toOp
 )
 where
diff --git a/content/lib/Pdf/Content/Parser.hs b/content/lib/Pdf/Content/Parser.hs
index f386491..c6403b8 100644
--- a/content/lib/Pdf/Content/Parser.hs
+++ b/content/lib/Pdf/Content/Parser.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 
 -- | Parse content stream
 
@@ -43,8 +44,31 @@ parseContent = do
   skipSpace
   (Parser.endOfInput >> return Nothing) <|>
     fmap Just (fmap Obj parseObject <|>
+               parseInlineImage <|>
                fmap (Op . toOp) (Parser.takeWhile1 isRegularChar))
 
+-- | Skip inline image data
+--
+-- Matches the @ID@ operator, discards everything up to the closing @EI@
+-- and yields a single 'Op_EI'; no separate operator is produced for @ID@.
+-- Image data is arbitrary bytes and can contain @EI@ by chance, so @EI@
+-- is taken as the end marker only when it is not followed by a regular
+-- character. This is a heuristic, not a full inline image decoder.
+parseInlineImage :: Parser Expr
+parseInlineImage = do
+  _ <- Parser.string "ID"
+  skipImageData
+  return (Op Op_EI)
+  where
+    -- drop one byte at a time instead of collecting the data into a list
+    skipImageData = endMarker <|> (Parser.anyChar >> skipImageData)
+    endMarker = do
+      _ <- Parser.string "EI"
+      next <- Parser.peekChar
+      case next of
+        Just c | isRegularChar c -> fail "EI inside inline image data"
+        _ -> return ()
+
 -- Treat comments as spaces
 skipSpace :: Parser ()
 skipSpace = do
diff --git a/content/lib/Pdf/Content/Processor.hs b/content/lib/Pdf/Content/Processor.hs
index 76b433c..a73f484 100644
--- a/content/lib/Pdf/Content/Processor.hs
+++ b/content/lib/Pdf/Content/Processor.hs
@@ -89,15 +89,17 @@ initialGraphicsState = GraphicsState {
 data Span = Span
   { spGlyphs :: [Glyph]
   , spFontName :: Name
-  }
+  } deriving Show
 
 -- | Processor maintains graphics state
 data Processor = Processor {
   prState :: GraphicsState,
   prStateStack :: [GraphicsState],
   prGlyphDecoder :: GlyphDecoder,
-  prSpans :: [Span]
+  prSpans :: [Span],
   -- ^ Each element is a list of glyphs, drawn in one shot
+  prOperators :: [Operator]
+  -- ^ Operators recorded by the caller, most recent first
   }
 
 -- | Create processor in initial state
@@ -106,7 +108,8 @@ mkProcessor = Processor {
   prState = initialGraphicsState,
   prStateStack = [],
   prGlyphDecoder = \_ _ -> [],
-  prSpans = mempty
+  prSpans = mempty,
+  prOperators = mempty
   }
 
 -- | Process one operation
diff --git a/document/lib/Pdf/Document/Page.hs b/document/lib/Pdf/Document/Page.hs
index 01bc39b..5c2abc8 100644
--- a/document/lib/Pdf/Document/Page.hs
+++ b/document/lib/Pdf/Document/Page.hs
@@ -11,7 +11,8 @@ module Pdf.Document.Page
   pageFontDicts,
   pageExtractText,
   pageExtractGlyphs,
-  glyphsToText
+  glyphsToText,
+  pageExtractOperators
 )
 where
 
@@ -249,6 +250,65 @@ pageExtractGlyphs page = do
     }
   return (List.reverse (prSpans p))
 
+-- | Extract all operators from the page's content streams
+--
+-- Operators are returned in the order they are processed. A @Do@ operator
+-- with a single name operand is not listed itself: when the name refers
+-- to one of the known XObjects, the operators of that XObject are listed
+-- in its place, otherwise nothing is listed for it.
+--
+-- Throws 'Unexpected' when the processor rejects an operator.
+pageExtractOperators :: Page -> IO [Operator]
+pageExtractOperators page = do
+  fontDicts <- Map.fromList <$> pageFontDicts page
+  glyphDecoders <- Traversable.forM fontDicts $ \fontDict ->
+    fontInfoDecodeGlyphs <$> fontDictLoadInfo fontDict
+  let glyphDecoder fontName str =
+        case Map.lookup fontName glyphDecoders of
+          Nothing -> []
+          Just decode -> decode str
+
+  xobjects <- pageXObjects page
+
+  is <- do
+    contents <- pageContents page
+    let Page pdf _ _ = page
+    raw <- combinedContent pdf contents
+    Streams.parserToInputStream parseContent raw
+
+  -- feed every operator to the content stream processor,
+  -- recording it first
+  let loop xobjs s p = do
+        next <- readNextOperator s
+        case next of
+          Just (Op_Do, [Name name]) ->
+            processDo xobjs name p >>= loop xobjs s
+          Just op -> do
+            let recorded = p { prOperators = op : prOperators p }
+            case processOp op recorded of
+              Left err -> throwIO (Unexpected err [])
+              Right p' -> loop xobjs s p'
+          Nothing -> return p
+
+      processDo xobjs name p =
+        case Map.lookup name xobjs of
+          Nothing -> return p
+          Just xobj -> do
+            s <- Streams.fromLazyByteString (xobjectContent xobj)
+              >>= Streams.parserToInputStream parseContent
+
+            -- run the XObject with its own glyph decoder,
+            -- then put the caller's decoder back
+            let gdec = prGlyphDecoder p
+            p' <- loop (xobjectChildren xobj) s
+              (p {prGlyphDecoder = xobjectGlyphDecoder xobj})
+            return (p' {prGlyphDecoder = gdec})
+
+  p <- loop xobjects is $ mkProcessor {
+    prGlyphDecoder = glyphDecoder
+    }
+  return (List.reverse (prOperators p))
+
 combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
 combinedContent pdf refs = do
   allStreams <- forM refs $ \ref -> do