Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ tmp/
cabal.sandbox.config
dist-newstyle/
.ghc.environment.*
/.envrc
/cabal.project.local
2 changes: 1 addition & 1 deletion cabal.project
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ packages:
core/
content/
document/
viewer/
-- viewer/
examples/
1 change: 1 addition & 0 deletions content/lib/Pdf/Content/Ops.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module Pdf.Content.Ops
Op(..),
Expr(..),
Operator,
Object(..),
toOp
)
where
Expand Down
8 changes: 8 additions & 0 deletions content/lib/Pdf/Content/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE OverloadedStrings #-}

-- | Parse content stream

Expand Down Expand Up @@ -43,8 +44,15 @@ parseContent = do
skipSpace
(Parser.endOfInput >> return Nothing) <|>
fmap Just (fmap Obj parseObject <|>
parseInlineImage <|>
fmap (Op . toOp) (Parser.takeWhile1 isRegularChar))

parseInlineImage :: Parser Expr
parseInlineImage = do
Parser.string "ID"
Parser.manyTill Parser.anyChar (Parser.string "EI")
Copy link
Copy Markdown
Owner

@Yuras Yuras Jun 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is very suspicious. What if image data contains EI?

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be interesting: mozilla/pdf.js#16461

Copy link
Copy Markdown
Author

@alanz alanz Jun 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, it worried me too. The best approach is to somehow derive the expected length of the image blob from the preamble information.

return $ Op Op_EI

-- Treat comments as spaces
skipSpace :: Parser ()
skipSpace = do
Expand Down
8 changes: 5 additions & 3 deletions content/lib/Pdf/Content/Processor.hs
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,16 @@ initialGraphicsState = GraphicsState {
data Span = Span
{ spGlyphs :: [Glyph]
, spFontName :: Name
}
} deriving Show

-- | Processor maintains graphics state
data Processor = Processor {
prState :: GraphicsState,
prStateStack :: [GraphicsState],
prGlyphDecoder :: GlyphDecoder,
prSpans :: [Span]
prSpans :: [Span],
-- ^ Each element is a list of glyphs, drawn in one shot
prOperators :: [Operator]
}

-- | Create processor in initial state
Expand All @@ -106,7 +107,8 @@ mkProcessor = Processor {
prState = initialGraphicsState,
prStateStack = [],
prGlyphDecoder = \_ _ -> [],
prSpans = mempty
prSpans = mempty,
prOperators = mempty
}

-- | Process one operation
Expand Down
51 changes: 50 additions & 1 deletion document/lib/Pdf/Document/Page.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ module Pdf.Document.Page
pageFontDicts,
pageExtractText,
pageExtractGlyphs,
glyphsToText
glyphsToText,
pageExtractOperators
)
where

Expand Down Expand Up @@ -249,6 +250,54 @@ pageExtractGlyphs page = do
}
return (List.reverse (prSpans p))

pageExtractOperators :: Page -> IO [Operator]
pageExtractOperators page = do
fontDicts <- Map.fromList <$> pageFontDicts page
glyphDecoders <- Traversable.forM fontDicts $ \fontDict ->
fontInfoDecodeGlyphs <$> fontDictLoadInfo fontDict
let glyphDecoder fontName = \str ->
case Map.lookup fontName glyphDecoders of
Nothing -> []
Just decode -> decode str

xobjects <- pageXObjects page

is <- do
contents <- pageContents page
let Page pdf _ _ = page
is <- combinedContent pdf contents
Streams.parserToInputStream parseContent is

-- use content stream processor to extract text
let loop xobjs s p = do
next <- readNextOperator s
case next of
Just (Op_Do, [Name name]) -> processDo xobjs name p >>= loop xobjs s
Just op -> do
let p' = p { prOperators = op : prOperators p }
case processOp op p' of
Left err -> throwIO (Unexpected err [])
Right p' -> loop xobjs s p'
Nothing -> return p

processDo xobjs name p = do
case Map.lookup name xobjs of
Nothing -> return p
Just xobj -> do
s <- do
s <- Streams.fromLazyByteString (xobjectContent xobj)
Streams.parserToInputStream parseContent s

let gdec' = prGlyphDecoder p
p' <- loop (xobjectChildren xobj) s
(p {prGlyphDecoder = xobjectGlyphDecoder xobj})
return (p' {prGlyphDecoder = gdec'})

p <- loop xobjects is $ mkProcessor {
prGlyphDecoder = glyphDecoder
}
return (List.reverse (prOperators p))

combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent pdf refs = do
allStreams <- forM refs $ \ref -> do
Expand Down