From f33a11e10f1206256612868dead62aab130d610b Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 31 Jul 2024 15:17:57 +0100 Subject: [PATCH 01/20] Linaro changes for JSON files consumed with React --- README.rst | 23 ++------ pyproject.toml | 2 +- sphinxcontrib/serializinghtml/__init__.py | 20 +++++-- .../serializinghtml/nav_html_to_json.py | 54 +++++++++++++++++++ 4 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 sphinxcontrib/serializinghtml/nav_html_to_json.py diff --git a/README.rst b/README.rst index d9ed40a..78eb63a 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,5 @@ -============================= -sphinxcontrib-serializinghtml -============================= +This is a fork of https://github.com/sphinx-doc/sphinxcontrib-serializinghtml -sphinxcontrib-serializinghtml is a sphinx extension which outputs -"serialized" HTML files (json and pickle). +Changes made to this fork are to facilitate the creation of JSON files suitable for consumption by React. -For more details, please visit http://www.sphinx-doc.org/. - -Installing -========== - -Install from PyPI:: - - pip install -U sphinxcontrib-serializinghtml - -Contributing -============ - -See `CONTRIBUTING.rst`__ - -.. __: https://github.com/sphinx-doc/sphinx/blob/master/CONTRIBUTING.rst +Since those changes are very specific, they have not been contributed back to the original repo. 
diff --git a/pyproject.toml b/pyproject.toml index f14054e..a8a4329 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ "Topic :: Text Processing", "Topic :: Utilities", ] -dependencies = [] +dependencies = ["beautifulsoup4"] dynamic = ["version"] [project.optional-dependencies] diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index bdbeb6f..d6e83ab 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -11,7 +11,7 @@ from sphinx.locale import get_translation from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path -from sphinxcontrib.serializinghtml import jsonimpl +from sphinxcontrib.serializinghtml import jsonimpl, nav_html_to_json if TYPE_CHECKING: from collections.abc import Sequence @@ -91,9 +91,23 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p ctx.setdefault('pathto', lambda p: p) self.add_sidebars(pagename, ctx) + # Add the toc tree as a JSON dictionary + ctx['toctree'] = nav_html_to_json.convert_nav_html_to_json(self._get_local_toctree(pagename)) + if not outfilename: + # PJC: Ensure that index files are actually written under the name of the + # directory leafname. 
+ parts = pagename.split(SEP) + if parts[len(parts)-1] == "index": + if len(parts) == 1: + # Use the project name + page_filename = self.get_builder_config('project_name', 'html') + else: + page_filename = SEP.join(parts[:-1]) + else: + page_filename = pagename outfilename = path.join(self.outdir, - os_path(pagename) + self.out_suffix) + os_path(page_filename) + self.out_suffix) # we're not taking the return value here, since no template is # actually rendered @@ -161,7 +175,7 @@ class JSONHTMLBuilder(SerializingHTMLBuilder): implementation_dumps_unicode = True indexer_format = jsonimpl indexer_dumps_unicode = True - out_suffix = '.fjson' + out_suffix = '.json' globalcontext_filename = 'globalcontext.json' searchindex_filename = 'searchindex.json' diff --git a/sphinxcontrib/serializinghtml/nav_html_to_json.py b/sphinxcontrib/serializinghtml/nav_html_to_json.py new file mode 100644 index 0000000..ff9b3d3 --- /dev/null +++ b/sphinxcontrib/serializinghtml/nav_html_to_json.py @@ -0,0 +1,54 @@ +from bs4 import BeautifulSoup, element +import json + +def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict: + section_result = [] + for child in list_entry.children: + if type(child) is element.Tag and child.name == "li": + section_result.append(convert_tag_to_link(child)) + return { + "type": "expandable-link-group", + "text": parent_entry.contents[0].contents[0], + "href": parent_entry.contents[0]["href"], + "items": section_result + } + + +def convert_tag_to_link(item_entry: element.Tag) -> dict: + # The a tag is a child of the li tag + a_tag = item_entry.contents[0] + return { + "type": "link", + "text": a_tag.contents[0], + "href": a_tag["href"] + } + +def convert_nav_html_to_json(html: str) -> list: + result = [] + soup = BeautifulSoup(html, "html.parser") + + # Start with the unordered list + ul = soup.ul + pending_divider = False + # Iterate through list items + for child in ul.children: + if type(child) is element.Tag and child.name == "li": + # 
Is there a new unordered list within this section? + section = child.find_all("ul", limit=1) + if section != []: + # Yes, there is, so we have a sub-section. If we've got some content + # already, add a divider. + if result != []: + result.append({ "type": "divider" }) + # Now append the current page and the section links. The + # ul tag is the only child returned, hence [0] + result.append(section_links(child, section[0])) + # If there are any "normal" entries after this section + # add a divider first + pending_divider = True + else: + if pending_divider: + result.append({ "type": "divider" }) + pending_divider = False + result.append(convert_tag_to_link(child)) + return result From d2ea55f51560b399a7357e1994735868b5c17154 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 31 Jul 2024 15:25:56 +0100 Subject: [PATCH 02/20] Update current_page_name to lose "/index" --- sphinxcontrib/serializinghtml/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index d6e83ab..747b72d 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -104,6 +104,7 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p page_filename = self.get_builder_config('project_name', 'html') else: page_filename = SEP.join(parts[:-1]) + ctx['current_page_name'] = page_filename else: page_filename = pagename outfilename = path.join(self.outdir, From 28e7ebdb55aa9ebff75c12b8ed98e8069a11d3dc Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Mon, 5 Aug 2024 08:45:04 +0100 Subject: [PATCH 03/20] Add dir/path support --- sphinxcontrib/serializinghtml/__init__.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 747b72d..d037db4 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ 
b/sphinxcontrib/serializinghtml/__init__.py @@ -55,7 +55,24 @@ class SerializingHTMLBuilder(StandaloneHTMLBuilder): def init(self) -> None: self.build_info = BuildInfo(self.config, self.tags) - self.imagedir = '_images' + # Cope with whether or not Sphinx has the required configuration variables + # set. + # See HTML Builder comments for explanation of image setup & handling + html_image_dir = None + try: + html_image_dir = self.get_builder_config('image_dir', 'html') + except AttributeError: + pass + if html_image_dir is not None: + self.imagedir = html_image_dir + else: + self.imagedir = '_images' + html_image_path = None + try: + html_image_path = self.get_builder_config('image_path', 'html') + except AttributeError: + pass + self.imagepath = html_image_path self.current_docname = '' self.theme = None # type: ignore[assignment] # no theme necessary self.templates = None # no template bridge necessary From b2d90cbfe0e1b6ca8a67bce8a58399fcd11c8ebf Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Mon, 5 Aug 2024 09:35:14 +0100 Subject: [PATCH 04/20] Add Linaro to version string --- sphinxcontrib/serializinghtml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index d037db4..06dc166 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... 
-__version__ = '2.0.0' +__version__ = '2.0.0+Linaro' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) From 62d87c1b89fad89034cb53e9d32ea40042884f59 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 6 Aug 2024 14:47:12 +0100 Subject: [PATCH 05/20] Make URIs absolute for Solutions Hub --- sphinxcontrib/serializinghtml/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 06dc166..8b3497e 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro' +__version__ = '2.0.0+Linaro-240806' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) @@ -83,11 +83,16 @@ def init(self) -> None: self.use_index = self.get_builder_config('use_index', 'html') def get_target_uri(self, docname: str, typ: str | None = None) -> str: + print(f"get_target_uri: {docname}") + # For the Solutions Hub, we want all URIs to be absolute. 
They will + # all be /library// except that "index" will + # get trimmed off + project_name = self.get_builder_config('project_name', 'html') if docname == 'index': - return '' + return f"/library/{project_name}/{project_name}" if docname.endswith(SEP + 'index'): - return docname[:-5] # up to sep - return docname + SEP + return f"/library/{project_name}/{docname[:-5]}" # up to sep + return f"/library/{project_name}/{docname}" def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None: context = context.copy() From e8a198143c11a37f842c085d3cb0c91e33e7ff2a Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 6 Aug 2024 14:51:31 +0100 Subject: [PATCH 06/20] Remove print statement --- sphinxcontrib/serializinghtml/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 8b3497e..622d12d 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -83,7 +83,6 @@ def init(self) -> None: self.use_index = self.get_builder_config('use_index', 'html') def get_target_uri(self, docname: str, typ: str | None = None) -> str: - print(f"get_target_uri: {docname}") # For the Solutions Hub, we want all URIs to be absolute. 
They will # all be /library// except that "index" will # get trimmed off From 11ea0ae28bf6d10be30559a54cbfcd2f8671de5a Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 6 Aug 2024 14:57:32 +0100 Subject: [PATCH 07/20] Trying another URI approach --- sphinxcontrib/serializinghtml/__init__.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 622d12d..a58bef0 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -83,15 +83,11 @@ def init(self) -> None: self.use_index = self.get_builder_config('use_index', 'html') def get_target_uri(self, docname: str, typ: str | None = None) -> str: - # For the Solutions Hub, we want all URIs to be absolute. They will - # all be /library// except that "index" will - # get trimmed off - project_name = self.get_builder_config('project_name', 'html') if docname == 'index': - return f"/library/{project_name}/{project_name}" + return "/" if docname.endswith(SEP + 'index'): - return f"/library/{project_name}/{docname[:-5]}" # up to sep - return f"/library/{project_name}/{docname}" + return f"/{docname[:-5]}" # up to sep + return f"/{docname}" def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None: context = context.copy() From d5e8f30dfe996aff47485d90ec1edcabf50d406b Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 6 Aug 2024 15:06:33 +0100 Subject: [PATCH 08/20] Completely revert URI changes. Didn't work. 
--- sphinxcontrib/serializinghtml/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index a58bef0..f037d4b 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -84,10 +84,10 @@ def init(self) -> None: def get_target_uri(self, docname: str, typ: str | None = None) -> str: if docname == 'index': - return "/" + return "" if docname.endswith(SEP + 'index'): - return f"/{docname[:-5]}" # up to sep - return f"/{docname}" + return docname[:-5] # up to sep + return docname def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None: context = context.copy() From 559f8d49ec78d3bb5a21fe56a28bb9b5e3479293 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 6 Aug 2024 16:13:49 +0100 Subject: [PATCH 09/20] Clean up navigation hrefs --- sphinxcontrib/serializinghtml/nav_html_to_json.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sphinxcontrib/serializinghtml/nav_html_to_json.py b/sphinxcontrib/serializinghtml/nav_html_to_json.py index ff9b3d3..9cbf8af 100644 --- a/sphinxcontrib/serializinghtml/nav_html_to_json.py +++ b/sphinxcontrib/serializinghtml/nav_html_to_json.py @@ -1,6 +1,14 @@ from bs4 import BeautifulSoup, element import json +def clean_href(href: str) -> str: + """ Make sure the href doesn't start or end with a / """ + if href[0] == "/": + href = href[1:] + if href[-1] == "/": + href = href[:-1] + return href + def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict: section_result = [] for child in list_entry.children: @@ -9,7 +17,7 @@ def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict: return { "type": "expandable-link-group", "text": parent_entry.contents[0].contents[0], - "href": parent_entry.contents[0]["href"], + "href": clean_href(parent_entry.contents[0]["href"]), "items": 
section_result } @@ -20,7 +28,7 @@ def convert_tag_to_link(item_entry: element.Tag) -> dict: return { "type": "link", "text": a_tag.contents[0], - "href": a_tag["href"] + "href": clean_href(a_tag["href"]) } def convert_nav_html_to_json(html: str) -> list: From 6ec2de085a2cf74338063cde1ab859bd61201fe3 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 7 Aug 2024 15:22:21 +0100 Subject: [PATCH 10/20] Fix UL bug and bump date --- sphinxcontrib/serializinghtml/__init__.py | 2 +- .../serializinghtml/nav_html_to_json.py | 41 ++++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index f037d4b..95ef5d1 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro-240806' +__version__ = '2.0.0+Linaro-240807' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) diff --git a/sphinxcontrib/serializinghtml/nav_html_to_json.py b/sphinxcontrib/serializinghtml/nav_html_to_json.py index 9cbf8af..98a92a0 100644 --- a/sphinxcontrib/serializinghtml/nav_html_to_json.py +++ b/sphinxcontrib/serializinghtml/nav_html_to_json.py @@ -39,24 +39,25 @@ def convert_nav_html_to_json(html: str) -> list: ul = soup.ul pending_divider = False # Iterate through list items - for child in ul.children: - if type(child) is element.Tag and child.name == "li": - # Is there a new unordered list within this section? - section = child.find_all("ul", limit=1) - if section != []: - # Yes, there is, so we have a sub-section. If we've got some content - # already, add a divider. 
- if result != []: - result.append({ "type": "divider" }) - # Now append the current page and the section links. The - # ul tag is the only child returned, hence [0] - result.append(section_links(child, section[0])) - # If there are any "normal" entries after this section - # add a divider first - pending_divider = True - else: - if pending_divider: - result.append({ "type": "divider" }) - pending_divider = False - result.append(convert_tag_to_link(child)) + if ul is not None: + for child in ul.children: + if type(child) is element.Tag and child.name == "li": + # Is there a new unordered list within this section? + section = child.find_all("ul", limit=1) + if section != []: + # Yes, there is, so we have a sub-section. If we've got some content + # already, add a divider. + if result != []: + result.append({ "type": "divider" }) + # Now append the current page and the section links. The + # ul tag is the only child returned, hence [0] + result.append(section_links(child, section[0])) + # If there are any "normal" entries after this section + # add a divider first + pending_divider = True + else: + if pending_divider: + result.append({ "type": "divider" }) + pending_divider = False + result.append(convert_tag_to_link(child)) return result From 73c32d51b69a1156ad6eb158c1753f8475c46587 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 4 Sep 2024 09:19:50 +0100 Subject: [PATCH 11/20] Handle TOCs with multiple UL sections --- sphinxcontrib/serializinghtml/__init__.py | 2 +- sphinxcontrib/serializinghtml/nav_html_to_json.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 95ef5d1..373af65 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... 
def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro-240807' +__version__ = '2.0.0+Linaro-240904' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) diff --git a/sphinxcontrib/serializinghtml/nav_html_to_json.py b/sphinxcontrib/serializinghtml/nav_html_to_json.py index 98a92a0..b212c78 100644 --- a/sphinxcontrib/serializinghtml/nav_html_to_json.py +++ b/sphinxcontrib/serializinghtml/nav_html_to_json.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup, element import json +import sys def clean_href(href: str) -> str: """ Make sure the href doesn't start or end with a / """ @@ -39,7 +40,7 @@ def convert_nav_html_to_json(html: str) -> list: ul = soup.ul pending_divider = False # Iterate through list items - if ul is not None: + while ul is not None: for child in ul.children: if type(child) is element.Tag and child.name == "li": # Is there a new unordered list within this section? @@ -60,4 +61,9 @@ def convert_nav_html_to_json(html: str) -> list: result.append({ "type": "divider" }) pending_divider = False result.append(convert_tag_to_link(child)) + while True: + ul = ul.next_sibling + if ul is None or type(ul) is element.Tag: + break + # Not an acceptable type - loop and get the next sibling return result From 766614f02af0c2b4921ae2bfa7d2666a098711cc Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 4 Sep 2024 14:44:53 +0100 Subject: [PATCH 12/20] Implement fix for entities in img alt text --- sphinxcontrib/serializinghtml/__init__.py | 11 +++++++++-- .../{nav_html_to_json.py => html_assists.py} | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) rename sphinxcontrib/serializinghtml/{nav_html_to_json.py => html_assists.py} (81%) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 373af65..72537a1 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -11,7 +11,7 @@ from 
sphinx.locale import get_translation from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path -from sphinxcontrib.serializinghtml import jsonimpl, nav_html_to_json +from sphinxcontrib.serializinghtml import html_assists, jsonimpl if TYPE_CHECKING: from collections.abc import Sequence @@ -109,7 +109,7 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p self.add_sidebars(pagename, ctx) # Add the toc tree as a JSON dictionary - ctx['toctree'] = nav_html_to_json.convert_nav_html_to_json(self._get_local_toctree(pagename)) + ctx['toctree'] = html_assists.convert_nav_html_to_json(self._get_local_toctree(pagename)) if not outfilename: # PJC: Ensure that index files are actually written under the name of the @@ -136,6 +136,13 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p if isinstance(ctx[key], types.FunctionType): del ctx[key] + # PJC: Some Linaro documentation has encoded attributes in image ALT text + # which then gets decoded when the HTML is loaded into the DOM, so + # we need to alter it by "escaping" the ampersands with &amp; to + # prevent the decoding. 
+ if "body" in ctx: + ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body']) + ensuredir(path.dirname(outfilename)) self.dump_context(ctx, outfilename) diff --git a/sphinxcontrib/serializinghtml/nav_html_to_json.py b/sphinxcontrib/serializinghtml/html_assists.py similarity index 81% rename from sphinxcontrib/serializinghtml/nav_html_to_json.py rename to sphinxcontrib/serializinghtml/html_assists.py index b212c78..3679940 100644 --- a/sphinxcontrib/serializinghtml/nav_html_to_json.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup, element -import json import sys +from html import escape def clean_href(href: str) -> str: """ Make sure the href doesn't start or end with a / """ @@ -32,6 +32,20 @@ def convert_tag_to_link(item_entry: element.Tag) -> dict: "href": clean_href(a_tag["href"]) } +def escape_encoded_alt_text(html: str) -> str: + soup = BeautifulSoup(html, "html.parser") + images = soup.find_all('img') + for img in images: + if img['alt'] != "": + # At this point, Beautiful Soup has done what a browser does - decode + # any encoded attributes. So we need to re-encode the string, see if + # there are any ampersands and, if so, re-encode them again. 
+ interim = escape(img['alt']) + if interim.find("&") != -1: + img['alt'] = escape(interim) + + return html + def convert_nav_html_to_json(html: str) -> list: result = [] soup = BeautifulSoup(html, "html.parser") From 2e0a128df58163634e27e3215cebb861f590d2c7 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 4 Sep 2024 14:45:13 +0100 Subject: [PATCH 13/20] Bump version --- sphinxcontrib/serializinghtml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 72537a1..a973d72 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro-240904' +__version__ = '2.0.0+Linaro-240904a' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) From 6e35e86a12719a9bc757802fb96dc4eccc4aa249 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 4 Sep 2024 14:57:46 +0100 Subject: [PATCH 14/20] Fix bug around editing img alt text --- sphinxcontrib/serializinghtml/html_assists.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py index 3679940..e3bcf49 100644 --- a/sphinxcontrib/serializinghtml/html_assists.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -33,6 +33,7 @@ def convert_tag_to_link(item_entry: element.Tag) -> dict: } def escape_encoded_alt_text(html: str) -> str: + edited = False soup = BeautifulSoup(html, "html.parser") images = soup.find_all('img') for img in images: @@ -43,7 +44,10 @@ def escape_encoded_alt_text(html: str) -> str: interim = escape(img['alt']) if interim.find("&") != -1: img['alt'] = escape(interim) + edited = True + 
if edited: + html = str(soup) return html def convert_nav_html_to_json(html: str) -> list: From 4585595ce5a8ef872601a3c5abb22b4bb9956d33 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Wed, 4 Sep 2024 15:03:05 +0100 Subject: [PATCH 15/20] Bump version --- sphinxcontrib/serializinghtml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index a973d72..b360030 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro-240904a' +__version__ = '2.0.0+Linaro-240904b' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) From 9445c448e5132a60c44cdaa53972dc4addedbe7c Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Thu, 24 Oct 2024 12:38:29 +0100 Subject: [PATCH 16/20] Fix encoded attributes in pre sections --- sphinxcontrib/serializinghtml/__init__.py | 15 ++- sphinxcontrib/serializinghtml/html_assists.py | 98 ++++++++++++------- 2 files changed, 74 insertions(+), 39 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index b360030..bc83403 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... 
-__version__ = '2.0.0+Linaro-240904b' +__version__ = '2.0.0+Linaro-241024' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) @@ -104,6 +104,7 @@ def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str] def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'page.html', outfilename: str | None = None, event_arg: Any = None) -> None: + print(f"handle_page: {pagename}") ctx['current_page_name'] = pagename ctx.setdefault('pathto', lambda p: p) self.add_sidebars(pagename, ctx) @@ -136,12 +137,16 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p if isinstance(ctx[key], types.FunctionType): del ctx[key] - # PJC: Some Linaro documentation has encoded attributes in image ALT text - # which then gets decoded when the HTML is loaded into the DOM, so - # we need to alter it by "escaping" the ampersands with &amp; to - # prevent the decoding. if "body" in ctx: + # PJC: Some Linaro documentation has encoded attributes in image ALT text + # which then gets decoded when the HTML is loaded into the DOM, so + # we need to alter it by "escaping" the ampersands with &amp; to + # prevent the decoding. ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body']) + # PJC: Furthermore, if there is any formatted code with encoded attributes, + # e.g. < changed to &lt; then that also needs to be escaped because it is + # also getting decoded. 
+ ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body']) ensuredir(path.dirname(outfilename)) self.dump_context(ctx, outfilename) diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py index e3bcf49..b1e6204 100644 --- a/sphinxcontrib/serializinghtml/html_assists.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -22,7 +22,6 @@ def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict: "items": section_result } - def convert_tag_to_link(item_entry: element.Tag) -> dict: # The a tag is a child of the li tag a_tag = item_entry.contents[0] @@ -32,6 +31,48 @@ def convert_tag_to_link(item_entry: element.Tag) -> dict: "href": clean_href(a_tag["href"]) } +def process_section(result, child, section, pending_divider) -> bool: + if section != []: + # Yes, there is, so we have a sub-section. If we've got some content + # already, add a divider. + if result != []: + result.append({ "type": "divider" }) + # Now append the current page and the section links. The + # ul tag is the only child returned, hence [0] + result.append(section_links(child, section[0])) + # If there are any "normal" entries after this section + # add a divider first + pending_divider = True + else: + if pending_divider: + result.append({ "type": "divider" }) + pending_divider = False + result.append(convert_tag_to_link(child)) + +def process_ul_children(result, ul): + pending_divider = False + for child in ul.children: + if type(child) is element.Tag and child.name == "li": + # Is there a new unordered list within this section? 
+ section = child.find_all("ul", limit=1) + pending_divider = process_section(result, child, section, pending_divider) + +def convert_nav_html_to_json(html: str) -> list: + result = [] + soup = BeautifulSoup(html, "html.parser") + + # Start with the unordered list + ul = soup.ul + # Iterate through list items + while ul is not None: + process_ul_children(result, ul) + while True: + ul = ul.next_sibling + if ul is None or type(ul) is element.Tag: + break + # Not an acceptable type - loop and get the next sibling + return result + def escape_encoded_alt_text(html: str) -> str: edited = False soup = BeautifulSoup(html, "html.parser") @@ -50,38 +91,27 @@ def escape_encoded_alt_text(html: str) -> str: html = str(soup) return html -def convert_nav_html_to_json(html: str) -> list: - result = [] +def escape_encoded_pre_text(html: str) -> str: + print("escape_encoded_pre_text") + edited = False soup = BeautifulSoup(html, "html.parser") + spans = soup.find_all('span') + for span in spans: + classes = span["class"] + matched_pre = False + for this_class in classes: + if this_class == "pre": + matched_pre = True + if matched_pre: + # At this point, Beautiful Soup has done what a browser does - decode + # any encoded attributes. So we need to re-encode the string, see if + # there are any ampersands and, if so, re-encode them again. + interim = escape(span.string) + if interim.find("&") != -1: + span.string = escape(interim) + edited = True - # Start with the unordered list - ul = soup.ul - pending_divider = False - # Iterate through list items - while ul is not None: - for child in ul.children: - if type(child) is element.Tag and child.name == "li": - # Is there a new unordered list within this section? - section = child.find_all("ul", limit=1) - if section != []: - # Yes, there is, so we have a sub-section. If we've got some content - # already, add a divider. - if result != []: - result.append({ "type": "divider" }) - # Now append the current page and the section links. 
The - # ul tag is the only child returned, hence [0] - result.append(section_links(child, section[0])) - # If there are any "normal" entries after this section - # add a divider first - pending_divider = True - else: - if pending_divider: - result.append({ "type": "divider" }) - pending_divider = False - result.append(convert_tag_to_link(child)) - while True: - ul = ul.next_sibling - if ul is None or type(ul) is element.Tag: - break - # Not an acceptable type - loop and get the next sibling - return result + if edited: + html = str(soup) + print(html) + return html From c47f1eb651401db3f0230dfa8ab20758ae116baf Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Thu, 24 Oct 2024 13:55:30 +0100 Subject: [PATCH 17/20] Fix bugs --- sphinxcontrib/serializinghtml/__init__.py | 2 +- sphinxcontrib/serializinghtml/html_assists.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index bc83403..545fce0 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... 
-__version__ = '2.0.0+Linaro-241024' +__version__ = '2.0.0+Linaro-241024a' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py index b1e6204..36f5fc1 100644 --- a/sphinxcontrib/serializinghtml/html_assists.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -91,18 +91,22 @@ def escape_encoded_alt_text(html: str) -> str: html = str(soup) return html +def matched_pre(span) -> bool: + """ Check if this span is specifying the "pre" class """ + if "class" not in span: + return False + classes = span["class"] + for this_class in classes: + if this_class == "pre": + return True + return False + def escape_encoded_pre_text(html: str) -> str: - print("escape_encoded_pre_text") edited = False soup = BeautifulSoup(html, "html.parser") spans = soup.find_all('span') for span in spans: - classes = span["class"] - matched_pre = False - for this_class in classes: - if this_class == "pre": - matched_pre = True - if matched_pre: + if matched_pre(span): # At this point, Beautiful Soup has done what a browser does - decode # any encoded attributes. So we need to re-encode the string, see if # there are any ampersands and, if so, re-encode them again. 
@@ -113,5 +117,4 @@ def escape_encoded_pre_text(html: str) -> str: if edited: html = str(soup) - print(html) return html From 05dd739f49a9ededa88b9f800e1396d4f6c932ca Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Mon, 28 Oct 2024 09:20:26 +0000 Subject: [PATCH 18/20] Fix pre handling --- sphinxcontrib/serializinghtml/__init__.py | 3 +-- sphinxcontrib/serializinghtml/html_assists.py | 27 ++++++------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 545fce0..36d5a22 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... -__version__ = '2.0.0+Linaro-241024a' +__version__ = '2.0.0+Linaro-241028' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) @@ -104,7 +104,6 @@ def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str] def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'page.html', outfilename: str | None = None, event_arg: Any = None) -> None: - print(f"handle_page: {pagename}") ctx['current_page_name'] = pagename ctx.setdefault('pathto', lambda p: p) self.add_sidebars(pagename, ctx) diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py index 36f5fc1..c818c5e 100644 --- a/sphinxcontrib/serializinghtml/html_assists.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -91,29 +91,18 @@ def escape_encoded_alt_text(html: str) -> str: html = str(soup) return html -def matched_pre(span) -> bool: - """ Check if this span is specifying the "pre" class """ - if "class" not in span: - return False - classes = span["class"] - for this_class in classes: - if 
this_class == "pre": - return True - return False - def escape_encoded_pre_text(html: str) -> str: edited = False soup = BeautifulSoup(html, "html.parser") - spans = soup.find_all('span') + spans = soup.find_all('span', class_="pre") for span in spans: - if matched_pre(span): - # At this point, Beautiful Soup has done what a browser does - decode - # any encoded attributes. So we need to re-encode the string, see if - # there are any ampersands and, if so, re-encode them again. - interim = escape(span.string) - if interim.find("&") != -1: - span.string = escape(interim) - edited = True + # At this point, Beautiful Soup has done what a browser does - decode + # any encoded attributes. So we need to re-encode the string, see if + # there are any ampersands and, if so, re-encode them again. + interim = escape(span.string) + if interim.find("&") != -1: + span.string = escape(interim) + edited = True if edited: html = str(soup) From 26cd446e097f3b5f494268ee24be09b4e6f269e9 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Tue, 29 Oct 2024 15:27:01 +0000 Subject: [PATCH 19/20] Return updated divider flag --- sphinxcontrib/serializinghtml/html_assists.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py index c818c5e..fb015da 100644 --- a/sphinxcontrib/serializinghtml/html_assists.py +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -48,6 +48,7 @@ def process_section(result, child, section, pending_divider) -> bool: result.append({ "type": "divider" }) pending_divider = False result.append(convert_tag_to_link(child)) + return pending_divider def process_ul_children(result, ul): pending_divider = False From 2aa3106f04801a07aee5f8b887e73f1670ccd5b6 Mon Sep 17 00:00:00 2001 From: Philip Colmer Date: Thu, 8 May 2025 14:47:53 +0100 Subject: [PATCH 20/20] Add support to map external links to relative local links --- sphinxcontrib/serializinghtml/__init__.py | 12 ++++++++++++ 
sphinxcontrib/serializinghtml/html_assists.py | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index 36d5a22..c44e1f7 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -81,6 +81,15 @@ def init(self) -> None: self.init_css_files() self.init_js_files() self.use_index = self.get_builder_config('use_index', 'html') + # + # PJC: New configuration to allow mapping of external links to + # relative Hub links. + link_mappings = None + try: + link_mappings = self.get_builder_config('link_mappings', 'html') + except AttributeError: + pass + self.link_mappings = link_mappings def get_target_uri(self, docname: str, typ: str | None = None) -> str: if docname == 'index': @@ -146,6 +155,9 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p # e.g. < changed to < then that also needs to be escaped because it is # also getting decoded. ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body']) + # PJC: Go through the body, looking for any tags to see if they + # need to be re-mapped to a local Hub path. 
+        ctx['body'] = html_assists.rewrite_hub_links(ctx['body'], self.link_mappings)
 
         ensuredir(path.dirname(outfilename))
         self.dump_context(ctx, outfilename)
diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py
index fb015da..1a1940e 100644
--- a/sphinxcontrib/serializinghtml/html_assists.py
+++ b/sphinxcontrib/serializinghtml/html_assists.py
@@ -108,3 +108,20 @@ def escape_encoded_pre_text(html: str) -> str:
     if edited:
         html = str(soup)
     return html
+
+def rewrite_hub_links(html: str, link_mappings: dict) -> str:
+    """Map <a> hrefs that start with a link_mappings key onto the mapped local path."""
+    if not link_mappings:
+        # The builder passes None when no mappings are configured: nothing to rewrite.
+        return html
+    edited = False
+    soup = BeautifulSoup(html, "html.parser")
+    # href=True skips anchors without an href, which would raise KeyError below.
+    for link in soup.find_all('a', href=True):
+        for key, target in link_mappings.items():
+            if link['href'].startswith(key):
+                # Swap only the matched prefix, then strip ".html" from the result.
+                link['href'] = (target + link['href'][len(key):]).replace(".html", "")
+                edited = True
+                break  # first matching key wins; do not re-map the rewritten href
+    return str(soup) if edited else html