From 1f3802f6855e74642276e7ef5a8a80e86c6c8119 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 13:34:19 +0000 Subject: [PATCH 1/3] feat: add 3 new Lua filters and fix citep link-text bug Agent-Logs-Url: https://github.com/ivoa/IvoaDocViewSite/sessions/e5ca5916-94f8-4d50-8a18-3df364c75125 Co-authored-by: pahjbo <273267+pahjbo@users.noreply.github.com> --- Makefile | 3 + .../drop-empty-inline-shells.lua | 54 +++++++++ .../fix-figure-media-links.lua | 45 ++++++++ pandocCustomization/relink-ivoa-citations.lua | 10 +- pandocCustomization/sanitize-raw-inline.lua | 107 ++++++++++++++++++ 5 files changed, 211 insertions(+), 8 deletions(-) create mode 100644 pandocCustomization/drop-empty-inline-shells.lua create mode 100644 pandocCustomization/fix-figure-media-links.lua create mode 100644 pandocCustomization/sanitize-raw-inline.lua diff --git a/Makefile b/Makefile index 981eeed..8430e6b 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,9 @@ $$(SPHINXDIR)/$(1)/$(1).rst : $$(SRCDIR)/$(1)/$(1).tex pandocCustomization/lates --lua-filter=$$(PANDCUST)/relink-ivoa-citations.lua \ --lua-filter=$$(PANDCUST)/autolink-docnames.lua \ --lua-filter=$$(PANDCUST)/fix_internal_refs.lua \ + --lua-filter=$$(PANDCUST)/sanitize-raw-inline.lua \ + --lua-filter=$$(PANDCUST)/fix-figure-media-links.lua \ + --lua-filter=$$(PANDCUST)/drop-empty-inline-shells.lua \ --lua-filter=$$(PANDCUST)/number-sections.lua --template=$$(PANDCUST)/default.rst\ > $$(ROOTDIR)/$$@ make -C $$(dir $$<) -f $$(ROOTDIR)/util.mk -f Makefile copyRequiredFiles TODIR=$$(ROOTDIR)/$$(dir $$@) diff --git a/pandocCustomization/drop-empty-inline-shells.lua b/pandocCustomization/drop-empty-inline-shells.lua new file mode 100644 index 0000000..130969b --- /dev/null +++ b/pandocCustomization/drop-empty-inline-shells.lua @@ -0,0 +1,54 @@ +-- drop-empty-inline-shells.lua +-- Remove inline container elements that carry no visible content. +-- +-- Empty containers arise from LaTeX constructs such as \emph{}, \textbf{}, +-- or empty groupings left over from macro expansion. They produce stray +-- whitespace or orphan punctuation in the rendered HTML output. +-- +-- An element is treated as empty when its content list contains only Space, +-- SoftBreak, LineBreak, or whitespace-only Str nodes. +-- +-- Handled element types: +-- Span, Emph, Strong, Strikeout, Superscript, Subscript, SmallCaps +-- +-- Placement in the filter chain: +-- Run after fix_internal_refs.lua and before number-sections.lua. + +local function is_whitespace_only(inlines) + -- Note: %s in Lua patterns matches ASCII whitespace only (space, tab, + -- newline, etc.). Non-breaking spaces (U+00A0) and other Unicode + -- whitespace are not caught; this is acceptable for IVOA standards content + -- which is predominantly ASCII. + for _, el in ipairs(inlines) do + local t = el.tag + if t == "Space" or t == "SoftBreak" or t == "LineBreak" then + -- acceptable whitespace node + elseif t == "Str" then + if not el.text:match("^%s*$") then + return false + end + else + return false + end + end + return true +end + +local function drop_if_empty(el) + if not el.content then return end + if #el.content == 0 or is_whitespace_only(el.content) then + return {} + end +end + +return { + { + Span = drop_if_empty, + Emph = drop_if_empty, + Strong = drop_if_empty, + Strikeout = drop_if_empty, + Superscript = drop_if_empty, + Subscript = drop_if_empty, + SmallCaps = drop_if_empty, + } +} diff --git a/pandocCustomization/fix-figure-media-links.lua b/pandocCustomization/fix-figure-media-links.lua new file mode 100644 index 0000000..141c995 --- /dev/null +++ b/pandocCustomization/fix-figure-media-links.lua @@ -0,0 +1,45 @@ +-- fix-figure-media-links.lua +-- Replace Image elements whose source file is a PDF with an RST hyperlink, +-- since browsers cannot render PDFs as inline images. +-- +-- Policy: PDF image src -> RST anonymous reference link to the PDF file. +-- The image alt-text (or the bare filename when alt is empty) is used as the +-- visible link label. +-- +-- Handles both: +-- * Inline images inside Para elements +-- * Block images inside Figure elements (the Image node is reached in both +-- cases because Pandoc walks all Image elements regardless of context) +-- +-- Placement in the filter chain: +-- Run after fix_internal_refs.lua and before number-sections.lua. + +local function is_pdf(src) + return src:match("%.[Pp][Dd][Ff]$") ~= nil +end + +local function basename(path) + return path:match("([^/\\]+)$") or path +end + +-- Escape backticks in RST link label text to avoid breaking the reference syntax. +local function escape_rst_label(s) + return s:gsub("`", "\\`") +end + +function Image(el) + if not is_pdf(el.src) then return end + + local alt = pandoc.utils.stringify(el.caption) + if alt == "" then + alt = basename(el.src) + end + + -- Emit an RST anonymous reference so the PDF is reachable as a link. + local rst = "`" .. escape_rst_label(alt) .. " <" .. el.src .. ">`__" + return pandoc.RawInline("rst", rst) +end + +return { + { Image = Image } +} diff --git a/pandocCustomization/relink-ivoa-citations.lua b/pandocCustomization/relink-ivoa-citations.lua index 470f7a9..628e610 100644 --- a/pandocCustomization/relink-ivoa-citations.lua +++ b/pandocCustomization/relink-ivoa-citations.lua @@ -42,14 +42,8 @@ function Cite(c) outstring = ":cite:`" .. doc_to_bib[std_docname] .. "`" end elseif bibkey and bibmap[bibkey] then - -- TODO - is this the best way to present? - if(v.text:find("citep")) then - local citetext = "ref" - outstring = ":doc:`".. citetext .." <../" .. bibmap[bibkey] .. "/" .. bibmap[bibkey] ..">`" - else - local citetext = bibmap[bibkey] - outstring = ":doc:`".. citetext .." <../" .. bibmap[bibkey] .. "/" .. bibmap[bibkey] ..">`" - end + local citetext = bibmap[bibkey] + outstring = ":doc:`".. citetext .." <../" .. bibmap[bibkey] .. "/" .. bibmap[bibkey] ..">`" else outstring = v.text:gsub("\\cite([pt]){([^}]+)}",":cite:%1:`%2`") diff --git a/pandocCustomization/sanitize-raw-inline.lua b/pandocCustomization/sanitize-raw-inline.lua new file mode 100644 index 0000000..99e8b6c --- /dev/null +++ b/pandocCustomization/sanitize-raw-inline.lua @@ -0,0 +1,107 @@ +-- sanitize-raw-inline.lua +-- Strip or convert residual LaTeX control sequences that survive pandoc's +-- latex+raw_tex pass as RawInline("latex", ...) nodes. +-- +-- Placement in the filter chain: +-- Run AFTER fix_internal_refs.lua so that \ref{} and \label{} have already +-- been converted to RST and will not be seen here. +-- +-- Policy: +-- * Whitespace/layout commands -> removed (return {}) +-- * Known text-expansion macros -> literal text +-- * \index{...}, \phantom{...} -> removed +-- * \hspace{...}, \vspace{...} -> removed +-- * \url{...} -> RST anonymous hyperlink +-- * \href{url}{text} -> RST anonymous hyperlink +-- * Anything else -> left unchanged (preserve unknown macros) + +-- Commands that produce no visible output and should be silently dropped. +local DROP_COMMANDS = { + ["\\noindent"] = true, + ["\\par"] = true, + ["\\clearpage"] = true, + ["\\newpage"] = true, + ["\\linebreak"] = true, + ["\\hfill"] = true, + ["\\vfill"] = true, + ["\\newline"] = true, + ["\\centering"] = true, + ["\\raggedright"]= true, + ["\\raggedleft"] = true, + ["\\null"] = true, + ["\\relax"] = true, + ["\\-"] = true, +} + +-- Commands that expand to a fixed literal string. +local EXPAND_COMMANDS = { + ["\\TeX"] = "TeX", + ["\\LaTeX"] = "LaTeX", + ["\\BibTeX"] = "BibTeX", + ["\\textbackslash"]= "\\", + ["\\ldots"] = "\226\128\166", -- U+2026 HORIZONTAL ELLIPSIS + ["\\dots"] = "\226\128\166", -- U+2026 HORIZONTAL ELLIPSIS + ["\\lq"] = "\226\128\152", -- U+2018 LEFT SINGLE QUOTATION MARK + ["\\rq"] = "\226\128\153", -- U+2019 RIGHT SINGLE QUOTATION MARK +} + +function RawInline(el) + if el.format ~= "latex" then return end + local text = el.text + + -- 1. Drop pure layout/whitespace macros (bare command, optional trailing *) + local cmd = text:match("^(\\%a+)%*?%s*$") or text:match("^(\\%-)$") + if cmd and DROP_COMMANDS[cmd] then + return {} + end + + -- 2. Text-expansion macros (bare, possibly followed by {} or whitespace) + -- The third pattern intentionally omits an end-anchor so that a macro + -- immediately followed by punctuation (e.g. "\LaTeX.") is still matched; + -- the suffix is preserved and appended to the expansion below. + local bare = text:match("^(\\%a+)%s*%{%}$") + or text:match("^(\\%a+)%s*$") + or text:match("^(\\%a+)[%s%p]") + if bare and EXPAND_COMMANDS[bare] then + local suffix = text:sub(#bare + 1) + local expansion = EXPAND_COMMANDS[bare] + -- Drop trailing {} or whitespace (TeX swallows a space after control words) + if suffix == "" or suffix:match("^%{%}") or suffix:match("^%s") then + return pandoc.Str(expansion) + end + return pandoc.Str(expansion .. suffix) + end + + -- 3. \index{...} and \phantom{...} – drop silently (no visible output) + if text:match("^\\index%s*%{") or text:match("^\\phantom%s*%{") then + return {} + end + + -- 4. \hspace{...} / \vspace{...} (with or without *) + if text:match("^\\[hv]space%*?%s*%{") then + return {} + end + + -- 5. \url{...} -> RST anonymous hyperlink + -- Note: [^}]+ does not handle percent-encoded closing braces (%7D); this + -- is an accepted limitation since IVOA standard sources use plain URLs. + local url = text:match("^\\url%s*%{([^}]+)%}$") + if url then + return pandoc.RawInline("rst", "`" .. url .. " <" .. url .. ">`__") + end + + -- 6. \href{url}{text} -> RST anonymous hyperlink + -- Note: [^}]+ does not handle nested braces; accepted limitation for + -- standard IVOA source conventions. + local href_url, href_text = text:match("^\\href%s*%{([^}]+)%}%s*%{([^}]*)%}$") + if href_url then + if href_text == "" then href_text = href_url end + return pandoc.RawInline("rst", "`" .. href_text .. " <" .. href_url .. ">`__") + end + + -- Everything else: leave unchanged so unknown macros are preserved +end + +return { + { RawInline = RawInline } +} From 608d11c84d61f79769489b37d27bf101255115b2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 13:37:16 +0000 Subject: [PATCH 2/3] refactor: address code review feedback on lua filters Agent-Logs-Url: https://github.com/ivoa/IvoaDocViewSite/sessions/e5ca5916-94f8-4d50-8a18-3df364c75125 Co-authored-by: pahjbo <273267+pahjbo@users.noreply.github.com> --- .../drop-empty-inline-shells.lua | 8 +++++--- pandocCustomization/fix-figure-media-links.lua | 7 ++++++- pandocCustomization/sanitize-raw-inline.lua | 18 ++++++++++-------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/pandocCustomization/drop-empty-inline-shells.lua b/pandocCustomization/drop-empty-inline-shells.lua index 130969b..e1f64d7 100644 --- a/pandocCustomization/drop-empty-inline-shells.lua +++ b/pandocCustomization/drop-empty-inline-shells.lua @@ -16,9 +16,11 @@ local function is_whitespace_only(inlines) -- Note: %s in Lua patterns matches ASCII whitespace only (space, tab, - -- newline, etc.). Non-breaking spaces (U+00A0) and other Unicode - -- whitespace are not caught; this is acceptable for IVOA standards content - -- which is predominantly ASCII. + -- newline, etc.). Unicode whitespace (e.g., U+00A0 non-breaking space) + -- will not be treated as empty, so a Span containing only a non-breaking + -- space will be kept rather than dropped. This is acceptable for current + -- IVOA standards content; add a UTF-8 check here if Unicode-only whitespace + -- containers become a problem in the future. for _, el in ipairs(inlines) do local t = el.tag if t == "Space" or t == "SoftBreak" or t == "LineBreak" then diff --git a/pandocCustomization/fix-figure-media-links.lua b/pandocCustomization/fix-figure-media-links.lua index 141c995..d6c8fa9 100644 --- a/pandocCustomization/fix-figure-media-links.lua +++ b/pandocCustomization/fix-figure-media-links.lua @@ -27,6 +27,11 @@ local function escape_rst_label(s) return s:gsub("`", "\\`") end +-- Escape characters in a URL that would break RST angle-bracket delimiters. +local function escape_rst_url(s) + return s:gsub(">", "%%3E") +end + function Image(el) if not is_pdf(el.src) then return end @@ -36,7 +41,7 @@ function Image(el) end -- Emit an RST anonymous reference so the PDF is reachable as a link. - local rst = "`" .. escape_rst_label(alt) .. " <" .. el.src .. ">`__" + local rst = "`" .. escape_rst_label(alt) .. " <" .. escape_rst_url(el.src) .. ">`__" return pandoc.RawInline("rst", rst) end diff --git a/pandocCustomization/sanitize-raw-inline.lua b/pandocCustomization/sanitize-raw-inline.lua index 99e8b6c..2e30cc1 100644 --- a/pandocCustomization/sanitize-raw-inline.lua +++ b/pandocCustomization/sanitize-raw-inline.lua @@ -39,10 +39,10 @@ local EXPAND_COMMANDS = { ["\\LaTeX"] = "LaTeX", ["\\BibTeX"] = "BibTeX", ["\\textbackslash"]= "\\", - ["\\ldots"] = "\226\128\166", -- U+2026 HORIZONTAL ELLIPSIS - ["\\dots"] = "\226\128\166", -- U+2026 HORIZONTAL ELLIPSIS - ["\\lq"] = "\226\128\152", -- U+2018 LEFT SINGLE QUOTATION MARK - ["\\rq"] = "\226\128\153", -- U+2019 RIGHT SINGLE QUOTATION MARK + ["\\ldots"] = "…", -- U+2026 HORIZONTAL ELLIPSIS + ["\\dots"] = "…", -- U+2026 HORIZONTAL ELLIPSIS + ["\\lq"] = "\xe2\x80\x98", -- U+2018 LEFT SINGLE QUOTATION MARK (') + ["\\rq"] = "\xe2\x80\x99", -- U+2019 RIGHT SINGLE QUOTATION MARK (') } function RawInline(el) @@ -83,16 +83,18 @@ function RawInline(el) end -- 5. \url{...} -> RST anonymous hyperlink - -- Note: [^}]+ does not handle percent-encoded closing braces (%7D); this - -- is an accepted limitation since IVOA standard sources use plain URLs. + -- Note: [^}]+ does not handle braces inside the URL (encoded as %7B/%7D + -- or unencoded). IVOA standard sources use plain URLs without braces, + -- so this is an accepted limitation. local url = text:match("^\\url%s*%{([^}]+)%}$") if url then return pandoc.RawInline("rst", "`" .. url .. " <" .. url .. ">`__") end -- 6. \href{url}{text} -> RST anonymous hyperlink - -- Note: [^}]+ does not handle nested braces; accepted limitation for - -- standard IVOA source conventions. + -- Note: [^}]+ / [^}]* do not handle braces in either the URL or the link + -- text. IVOA standard sources do not use braces in \href arguments, so + -- this is an accepted limitation. local href_url, href_text = text:match("^\\href%s*%{([^}]+)%}%s*%{([^}]*)%}$") if href_url then if href_text == "" then href_text = href_url end From 11b92dc1e62a6924cb8e50c0946d95315c2edd56 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 15:59:57 +0000 Subject: [PATCH 3/3] feat: add drop-conformance-section.lua filter Agent-Logs-Url: https://github.com/ivoa/IvoaDocViewSite/sessions/c9f94af1-154f-4960-9fc4-e2f923d3d55e Co-authored-by: pahjbo <273267+pahjbo@users.noreply.github.com> --- Makefile | 1 + .../drop-conformance-section.lua | 55 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 pandocCustomization/drop-conformance-section.lua diff --git a/Makefile b/Makefile index 8430e6b..1ef9829 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,7 @@ $$(SPHINXDIR)/$(1)/$(1).rst : $$(SRCDIR)/$(1)/$(1).tex pandocCustomization/lates --lua-filter=$$(PANDCUST)/sanitize-raw-inline.lua \ --lua-filter=$$(PANDCUST)/fix-figure-media-links.lua \ --lua-filter=$$(PANDCUST)/drop-empty-inline-shells.lua \ + --lua-filter=$$(PANDCUST)/drop-conformance-section.lua \ --lua-filter=$$(PANDCUST)/number-sections.lua --template=$$(PANDCUST)/default.rst\ > $$(ROOTDIR)/$$@ make -C $$(dir $$<) -f $$(ROOTDIR)/util.mk -f Makefile copyRequiredFiles TODIR=$$(ROOTDIR)/$$(dir $$@) diff --git a/pandocCustomization/drop-conformance-section.lua b/pandocCustomization/drop-conformance-section.lua new file mode 100644 index 0000000..34f3136 --- /dev/null +++ b/pandocCustomization/drop-conformance-section.lua @@ -0,0 +1,55 @@ +-- drop-conformance-section.lua +-- Remove the "Conformance-related definitions" boilerplate section that +-- appears in every IVOA standard. This section is standard ivoatex +-- scaffolding (RFC 2119 keyword definitions) and is not useful as rendered +-- per-document prose on the multi-document site. +-- +-- The section header is identified by a case-insensitive match on the words +-- "conformance" and "definition" appearing together in the header text. +-- All blocks that belong to the matched section (from its header up to, but +-- not including, the next header at the same or higher level) are dropped. +-- +-- Placement in the filter chain: +-- Run before number-sections.lua so the header text has not yet been +-- prefixed with a section number. + +local function is_conformance_header(header) + local text = pandoc.utils.stringify(header.content):lower() + return text:find("conformance") and text:find("definition") +end + +function Pandoc(doc) + local new_blocks = {} + local skip = false + local skip_level = nil + + for _, block in ipairs(doc.blocks) do + if block.tag == "Header" then + if not skip and is_conformance_header(block) then + -- Start skipping: record the section level so we know when + -- a sibling or ancestor header ends the skipped region. + skip = true + skip_level = block.level + elseif skip and block.level <= skip_level then + -- A header at the same or higher level (numerically <=) ends + -- the conformance section; stop skipping and keep this block. + skip = false + skip_level = nil + new_blocks[#new_blocks + 1] = block + elseif not skip then + new_blocks[#new_blocks + 1] = block + end + -- (if skip and block.level > skip_level: a sub-section inside + -- the conformance section – keep skipping, do nothing) + elseif not skip then + new_blocks[#new_blocks + 1] = block + end + end + + doc.blocks = pandoc.Blocks(new_blocks) + return doc +end + +return { + { Pandoc = Pandoc } +}