Yuras · alanz · Jun 7, 2023 · Jun 7, 2023 · Jun 13, 2023 · Yuras
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ tmp/
 cabal.sandbox.config
 dist-newstyle/
 .ghc.environment.*
+/.envrc
+/cabal.project.local
diff --git a/cabal.project b/cabal.project
@@ -2,5 +2,5 @@ packages:
   core/
   content/
   document/
-  viewer/
+  -- viewer/
   examples/
diff --git a/content/lib/Pdf/Content/Ops.hs b/content/lib/Pdf/Content/Ops.hs
@@ -7,6 +7,7 @@ module Pdf.Content.Ops
   Op(..),
   Expr(..),
   Operator,
+  Object(..),
   toOp
 )
 where

diff --git a/content/lib/Pdf/Content/Parser.hs b/content/lib/Pdf/Content/Parser.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 
 -- | Parse content stream
 
@@ -43,8 +44,15 @@ parseContent = do
   skipSpace
   (Parser.endOfInput >> return Nothing) <|>
     fmap Just (fmap Obj parseObject <|>
+              parseInlineImage <|>
                fmap (Op . toOp) (Parser.takeWhile1 isRegularChar))
 
+-- | Skip an inline image: @ID@, then data up to an @EI@ followed by white space
+-- or end of input (image data may itself contain @EI@); yields 'Op_EI'.
+parseInlineImage :: Parser Expr
+parseInlineImage = Op Op_EI <$ (Parser.string "ID" *> Parser.manyTill Parser.anyChar end)
+  where end = Parser.string "EI" *> (Parser.endOfInput <|> (() <$ Parser.space))
+
 -- Treat comments as spaces
 skipSpace :: Parser ()
 skipSpace = do

diff --git a/content/lib/Pdf/Content/Processor.hs b/content/lib/Pdf/Content/Processor.hs
@@ -89,15 +89,16 @@ initialGraphicsState = GraphicsState {
 data Span = Span
   { spGlyphs :: [Glyph]
   , spFontName :: Name
-  }
+  } deriving Show
 
 -- | Processor maintains graphics state
 data Processor = Processor {
   prState :: GraphicsState,
   prStateStack :: [GraphicsState],
   prGlyphDecoder :: GlyphDecoder,
-  prSpans :: [Span]
+  prSpans :: [Span],
   -- ^ Each element is a list of glyphs, drawn in one shot
+  prOperators :: [Operator] -- ^ Recorded operators, most recent first
   }
 
 -- | Create processor in initial state
@@ -106,7 +107,8 @@ mkProcessor = Processor {
   prState = initialGraphicsState,
   prStateStack = [],
   prGlyphDecoder = \_ _ -> [],
-  prSpans = mempty
+  prSpans = mempty,
+  prOperators = mempty
   }
 
 -- | Process one operation

diff --git a/document/lib/Pdf/Document/Page.hs b/document/lib/Pdf/Document/Page.hs
@@ -11,7 +11,8 @@ module Pdf.Document.Page
   pageFontDicts,
   pageExtractText,
   pageExtractGlyphs,
-  glyphsToText
+  glyphsToText,
+  pageExtractOperators
 )
 where
 
@@ -249,6 +250,54 @@ pageExtractGlyphs page = do
     }
   return (List.reverse (prSpans p))
 
+-- | Extract the operators of a page, in the order they are read.
+--
+-- A @Do@ operator with a single name operand is not listed itself; the
+-- operators of the XObject it names are listed in its place. Throws
+-- 'Unexpected' when 'processOp' rejects an operator.
+pageExtractOperators :: Page -> IO [Operator]
+pageExtractOperators page = do
+  fontDicts <- Map.fromList <$> pageFontDicts page
+  decoders <- Traversable.forM fontDicts $ \dict ->
+    fontInfoDecodeGlyphs <$> fontDictLoadInfo dict
+  -- Font names missing from the map decode to an empty list.
+  let decodeWith fontName str =
+        maybe [] ($ str) (Map.lookup fontName decoders)
+
+  xobjects <- pageXObjects page
+
+  let Page pdf _ _ = page
+  contents <- pageContents page
+  raw <- combinedContent pdf contents
+  exprs <- Streams.parserToInputStream parseContent raw
+
+  -- Feed each operator to the content stream processor, recording it first.
+  let run xobjs stream st = do
+        next <- readNextOperator stream
+        case next of
+          Nothing -> return st
+          Just (Op_Do, [Name name]) ->
+            expand xobjs name st >>= run xobjs stream
+          Just op ->
+            either (\err -> throwIO (Unexpected err [])) (run xobjs stream) $
+              processOp op (st { prOperators = op : prOperators st })
+
+      -- Run the content of the named XObject; unknown names are ignored.
+      expand xobjs name st =
+        case Map.lookup name xobjs of
+          Nothing -> return st
+          Just xobj -> do
+            xraw <- Streams.fromLazyByteString (xobjectContent xobj)
+            xexprs <- Streams.parserToInputStream parseContent xraw
+            -- The XObject's own glyph decoder applies only while inside
+            -- it; the caller's decoder is restored afterwards.
+            inner <- run (xobjectChildren xobj) xexprs
+              (st { prGlyphDecoder = xobjectGlyphDecoder xobj })
+            return (inner { prGlyphDecoder = prGlyphDecoder st })
+
+  final <- run xobjects exprs (mkProcessor { prGlyphDecoder = decodeWith })
+  return (List.reverse (prOperators final))
+
 combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
 combinedContent pdf refs = do
   allStreams <- forM refs $ \ref -> do
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ module Pdf.Content.Ops @@
       Op(..),
       Expr(..),
       Operator,
+      Object(..),
       toOp
     )
     where
@@ Expand Down @@