|
| 1 | +# -*- coding: utf-8 -*- # |
| 2 | +# frozen_string_literal: true |
| 3 | +# vim: set ts=2 sw=2 et: |
| 4 | + |
| 5 | +# PDF = Portable Document Format page description language |
| 6 | +# As defined by ISO 32000-2:2020 including resolved errata from https://pdf-issues.pdfa.org/ |
| 7 | +# |
| 8 | +# The PDF syntax is also known as "COS" and can be used with FDF (Forms Data Format) files as |
| 9 | +# per ISO 32000-2:2020 clause 12.7.8. |
| 10 | +# |
| 11 | +# This is a token-based parser ONLY! It is intended to syntax highlight full or partial fragments |
| 12 | +# of cleanly hand-written PDF syntax in documentation such as ISO specifications. It is NOT |
| 13 | +# intended to cope with real-world PDFs that will contain arbitrary binary data (that form invalid |
| 14 | +# UTF-8 sequences and generate "ArgumentError: invalid byte sequence in UTF-8" Ruby errors) and |
| 15 | +# other types of malformations or syntax errors. |
| 16 | +# |
| 17 | +# Author: Peter Wyatt, CTO, PDF Association. 2024 |
| 18 | +# |
module Rouge
  module Lexers
    # Token-based lexer for hand-written PDF / FDF ("COS") syntax, whole files
    # or partial fragments. It classifies tokens only; it does not validate
    # object structure and is not meant for binary, real-world PDF files.
    class Pdf < RegexLexer
      title "PDF"
      desc "PDF - Portable Document Format (ISO 32000)"
      tag 'pdf'
      aliases "fdf", 'cos'
      filenames '*.pdf', '*.fdf'
      mimetypes 'application/pdf', 'application/fdf' # IANA registered media types

      # PDF and FDF files must start with "%PDF-x.y" or "%FDF-x.y"
      # where x is the single digit major version and y is the single digit minor version.
      # For simplicity as a syntax highlighter, the header is only looked for at the
      # very start of the text.
      #
      # @param text [String] the source text being guessed
      # @return [Boolean] true when the text opens with a PDF or FDF header
      def self.detect?(text)
        /\A%[PF]DF-\d\.\d/.match?(text)
      end

      # PDF Delimiters (ISO 32000-2:2020, Table 1 and Table 2).
      # Ruby whitespace "\s" is /[ \t\r\n\f\v]/ which does not include NUL (ISO 32000-2:2020, Table 1).
      # PDF also supports 2 character EOL sequences.

      # Zero-width check that a token is finished: the next character is a
      # delimiter or whitespace, or the input has ended. Without it a keyword
      # would be split off the front of a longer run of regular characters
      # (e.g. "nullify"), and "1 0 R" could not be told apart from "1 0 RG".
      token_end = /(?=[\(\)<>\[\]\/%\s]|\z)/

      state :root do
        # Start-of-file header comment is special (comment is up to EOL)
        rule %r/^%(P|F)DF-\d\.\d.*$/, Comment::Preproc

        # End-of-file marker comment is special (comment is up to EOL)
        rule %r/^%%EOF.*$/, Comment::Preproc

        # PDF only has single-line comments: from "%" to EOL
        rule %r/%.*$/, Comment::Single

        # PDF Boolean and null object keywords
        rule %r/(?:false|true|null)#{token_end}/, Keyword::Constant

        # PDF Dictionary and array object start and end tokens
        rule %r/(<<|>>|\[|\])/, Punctuation

        # PDF Hex string - can contain whitespace and span multiple lines.
        # This rule must be after "<<"/">>"
        rule %r/<[0-9A-Fa-f\s]*>/m, Str::Other

        # PDF literal strings are complex (multi-line, escapes, etc.). Use separate state machine.
        rule %r/\(/, Str, :stringliteral

        # PDF Name objects - can be empty (i.e., nothing after "/").
        # No special processing required for 2-digit hex codes that start with "#".
        rule %r/\/[^\(\)<>\[\]\/%\s]*/, Name::Other

        # PDF objects and stream (no checking of object ID)
        # Note that object number and generation numbers do not have sign.
        rule %r/\d+\s\d+\sobj#{token_end}/, Keyword::Declaration
        rule %r/(?:endstream|endobj|stream)#{token_end}/, Keyword::Declaration

        # PDF conventional file layout keywords
        # ("startxref" must be tried before "xref").
        rule %r/(?:startxref|trailer|xref)#{token_end}/, Keyword::Declaration

        # PDF cross reference section entries (20 bytes including EOL).
        # Explicit single SPACE separators.
        rule %r/^\d{10} \d{5} (n|f)\s*$/, Keyword::Namespace

        # PDF Indirect reference (lax, allows zero as the object number).
        # Requires a terminating delimiter (or end of input) to disambiguate
        # from the "RG" operator.
        rule %r/\d+\s\d+\sR#{token_end}/, Name::Decorator

        # PDF Real object - must contain a decimal point ("4.", "4.5", ".5"),
        # otherwise plain integers would be swallowed by this rule.
        rule %r/[+-]?(?:[0-9]+\.[0-9]*|\.[0-9]+)/, Num::Float

        # PDF Integer object
        rule %r/[+-]?[0-9]+/, Num::Integer

        # A run of non-delimiters is most likely a PDF content stream
        # operator (ISO 32000-2:2020, Annex A).
        rule %r/[^\(\)<>\[\]\/%\s]+/, Operator::Word

        # Whitespace (except inside strings and comments) is ignored = /[ \t\r\n\f\v]/.
        # Ruby doesn't include NUL as whitespace (vs ISO 32000-2:2020 Table 1)
        rule %r/\s+/, Text::Whitespace
      end

      # PDF literal string. See ISO 32000-2:2020 clause 7.3.4.2 and Table 3
      state :stringliteral do
        # Only unescaped parentheses take part in balancing: "(" nests one
        # level deeper and ")" closes the current level.
        rule %r/\(/, Str, :stringliteral
        rule %r/\)/, Str, :pop!

        # Escape sequences: 1 to 3 octal digits, the single-letter escapes,
        # an escaped backslash, and escaped parentheses. Escaped parentheses
        # are literal characters and must not change the nesting depth.
        rule %r/\\(?:[0-7]{1,3}|[nrtbf\(\)\\])/, Str::Escape

        # Backslash immediately before an EOL is a line continuation.
        rule %r/\\(?:\r\n|\r|\n)/, Str::Escape

        # Any other backslash is kept as ordinary string content rather than
        # falling through to an error token.
        rule %r/\\/, Str

        rule %r/[^\(\)\\]+/, Str
      end
    end
  end
end
0 commit comments