diff --git a/README.rst b/README.rst index d9ed40a..78eb63a 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,5 @@ -============================= -sphinxcontrib-serializinghtml -============================= +This is a fork of https://github.com/sphinx-doc/sphinxcontrib-serializinghtml -sphinxcontrib-serializinghtml is a sphinx extension which outputs -"serialized" HTML files (json and pickle). +Changes made to this fork are to facilitate the creation of JSON files suitable for consumption by React. -For more details, please visit http://www.sphinx-doc.org/. - -Installing -========== - -Install from PyPI:: - - pip install -U sphinxcontrib-serializinghtml - -Contributing -============ - -See `CONTRIBUTING.rst`__ - -.. __: https://github.com/sphinx-doc/sphinx/blob/master/CONTRIBUTING.rst +Since those changes are very specific, they have not been contributed back to the original repo. diff --git a/pyproject.toml b/pyproject.toml index f14054e..a8a4329 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ "Topic :: Text Processing", "Topic :: Utilities", ] -dependencies = [] +dependencies = ["beautifulsoup4"] dynamic = ["version"] [project.optional-dependencies] diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py index bdbeb6f..2adf9ae 100644 --- a/sphinxcontrib/serializinghtml/__init__.py +++ b/sphinxcontrib/serializinghtml/__init__.py @@ -11,7 +11,7 @@ from sphinx.locale import get_translation from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path -from sphinxcontrib.serializinghtml import jsonimpl +from sphinxcontrib.serializinghtml import html_assists, jsonimpl if TYPE_CHECKING: from collections.abc import Sequence @@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ... def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ... def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ... 
-__version__ = '2.0.0' +__version__ = '2.0.0+Linaro-250828' __version_info__ = (2, 0, 0) package_dir = path.abspath(path.dirname(__file__)) @@ -55,7 +55,24 @@ class SerializingHTMLBuilder(StandaloneHTMLBuilder): def init(self) -> None: self.build_info = BuildInfo(self.config, self.tags) - self.imagedir = '_images' + # Cope with whether or not Sphinx has the required configuration variables + # set. + # See HTML Builder comments for explanation of image setup & handling + html_image_dir = None + try: + html_image_dir = self.get_builder_config('image_dir', 'html') + except AttributeError: + pass + if html_image_dir is not None: + self.imagedir = html_image_dir + else: + self.imagedir = '_images' + html_image_path = None + try: + html_image_path = self.get_builder_config('image_path', 'html') + except AttributeError: + pass + self.imagepath = html_image_path self.current_docname = '' self.theme = None # type: ignore[assignment] # no theme necessary self.templates = None # no template bridge necessary @@ -64,13 +81,22 @@ def init(self) -> None: self.init_css_files() self.init_js_files() self.use_index = self.get_builder_config('use_index', 'html') + # + # PJC: New configuration to allow mapping of external links to + # relative Hub links. 
+ link_mappings = None + try: + link_mappings = self.get_builder_config('link_mappings', 'html') + except AttributeError: + pass + self.link_mappings = link_mappings def get_target_uri(self, docname: str, typ: str | None = None) -> str: if docname == 'index': - return '' + return "" if docname.endswith(SEP + 'index'): return docname[:-5] # up to sep - return docname + SEP + return docname def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None: context = context.copy() @@ -91,9 +117,24 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p ctx.setdefault('pathto', lambda p: p) self.add_sidebars(pagename, ctx) + # Add the toc tree as a JSON dictionary + ctx['toctree'] = html_assists.convert_nav_html_to_json(self._get_local_toctree(pagename, includehidden=True)) + if not outfilename: + # PJC: Ensure that index files are actually written under the name of the + # directory leafname. + parts = pagename.split(SEP) + if parts[len(parts)-1] == "index": + if len(parts) == 1: + # Use the project name + page_filename = self.get_builder_config('project_name', 'html') + else: + page_filename = SEP.join(parts[:-1]) + ctx['current_page_name'] = page_filename + else: + page_filename = pagename outfilename = path.join(self.outdir, - os_path(pagename) + self.out_suffix) + os_path(page_filename) + self.out_suffix) # we're not taking the return value here, since no template is # actually rendered @@ -104,6 +145,20 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p if isinstance(ctx[key], types.FunctionType): del ctx[key] + if "body" in ctx: + # PJC: Some Linaro documentation has encoded attributes in image ALT text + # which then gets decoded when the HTML is loaded into the DOM, so + # we need to alter it by "escaping" the ampersands with & to + # prevent the decoding. 
+ ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body']) + # PJC: Furthermore, if there is any formatted code with encoded attributes, + # e.g. < changed to < then that also needs to be escaped because it is + # also getting decoded. + ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body']) + # PJC: Go through the body, looking for any tags to see if they + # need to be re-mapped to a local Hub path. + ctx['body'] = html_assists.rewrite_hub_links(ctx['body'], self.link_mappings, page_filename) + ensuredir(path.dirname(outfilename)) self.dump_context(ctx, outfilename) @@ -161,7 +216,7 @@ class JSONHTMLBuilder(SerializingHTMLBuilder): implementation_dumps_unicode = True indexer_format = jsonimpl indexer_dumps_unicode = True - out_suffix = '.fjson' + out_suffix = '.json' globalcontext_filename = 'globalcontext.json' searchindex_filename = 'searchindex.json' diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py new file mode 100644 index 0000000..5978574 --- /dev/null +++ b/sphinxcontrib/serializinghtml/html_assists.py @@ -0,0 +1,230 @@ +from bs4 import BeautifulSoup, element +from html import escape +from urllib.parse import urlparse +from pathlib import PurePosixPath + +def is_relative_url(url): + parsed = urlparse(url) + return not parsed.scheme and not parsed.netloc + +def clean_href(href: str) -> str: + """ Make sure the href doesn't start or end with a / """ + if href[0] == "/": + href = href[1:] + if href[-1] == "/": + href = href[:-1] + return href + +def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict: + section_result = [] + for child in list_entry.children: + if type(child) is element.Tag and child.name == "li": + section_result.append(convert_tag_to_link(child)) + return { + "type": "section", + "text": parent_entry.contents[0].contents[0], + "items": section_result + } + +def convert_tag_to_link(item_entry: element.Tag) -> dict: + # The a tag is a child of the 
li tag + a_tag = item_entry.contents[0] + return { + "type": "link", + "text": a_tag.contents[0], + "href": clean_href(a_tag["href"]) + } + +def process_section(result, child, section): + # Is there a new unordered list within this section? + if section != []: + # Only add a starting divider if there is already content + if result != []: + result.append({ "type": "divider" }) + # Now append the current page and the section links. The + # ul tag is the only child returned, hence [0] + result.append(section_links(child, section[0])) + result.append({ "type": "divider" }) + else: + result.append(convert_tag_to_link(child)) + +def process_ul_children(result, ul): + for child in ul.children: + if type(child) is element.Tag and child.name == "li": + section = child.find_all("ul", limit=1) + process_section(result, child, section) + +def convert_nav_html_to_json(html: str) -> list: + result = [] + soup = BeautifulSoup(html, "html.parser") + top_level_tags = soup.find_all(recursive=False) + + caption = None + for tag in top_level_tags: + if type(tag) is element.Tag and tag.name == "p" and tag.has_attr("class") and "caption" in tag["class"]: + span = tag.findChild("span") + caption = span.text + elif type(tag) is element.Tag and tag.name == "ul": + if caption is not None: + local_result = [] + process_ul_children(local_result, tag) + result.append({ + "type": "section-group", + "title": caption, + "items": local_result + }) + caption = None + else: + process_ul_children(result, tag) + return result + +def escape_encoded_alt_text(html: str) -> str: + edited = False + soup = BeautifulSoup(html, "html.parser") + images = soup.find_all('img') + for img in images: + if img['alt'] != "": + # At this point, Beautiful Soup has done what a browser does - decode + # any encoded attributes. So we need to re-encode the string, see if + # there are any ampersands and, if so, re-encode them again. 
+ interim = escape(img['alt']) + if interim.find("&") != -1: + img['alt'] = escape(interim) + edited = True + + if edited: + html = str(soup) + return html + +def re_encode_span_tags(span_tags, edited) -> bool: + for span_tag in span_tags: + content = span_tag.string + if content is not None: + interim = escape(content) + if interim.find("&") != -1: + span_tag.string = escape(interim) + edited = True + return edited + +def escape_encoded_pre_text(html: str) -> str: + # The reason for this function is because, when the browser loads the + # HTML from the JSON data, it decodes any encoded attributes, such as + # < and >, so we need to re-encode them to prevent the browser + # from decoding them. + # + # There are two separate search cases that are implemented here: + # + # 1. The tags that are used to format code in the HTML, which + # are used in the "pre" tags. + # 2. The
+    #    <pre> tags themselves, which may contain code that has been
+    #    formatted with HTML entities, such as &lt; and &gt;.
+
+    edited = False
+    soup = BeautifulSoup(html, "html.parser")
+
+    span_tags = soup.find_all('span', class_="pre")
+    edited = re_encode_span_tags(span_tags, edited)
+
+    pre_tags = soup.find_all('pre')
+    for pre_tag in pre_tags:
+        span_tags = pre_tag.find_all("span")
+        edited = re_encode_span_tags(span_tags, edited)
+
+    if edited:
+        html = str(soup)
+    return html
+
+def relative_traversal(from_path, to_path):
+    """Build a relative link leading from ``from_path`` to ``to_path``.
+
+    Both arguments are split into POSIX path components. The result climbs
+    out of the ``from_path`` components that are not shared with ``to_path``
+    and then descends into the unshared remainder of ``to_path``. One fewer
+    ``../`` is emitted than there are unshared ``from_path`` components,
+    because of the way Next.js handles routes.
+    """
+    origin = PurePosixPath(from_path).parts
+    target = PurePosixPath(to_path).parts
+
+    # Count the leading components that the two paths have in common.
+    shared = 0
+    limit = min(len(origin), len(target))
+    while shared < limit and origin[shared] == target[shared]:
+        shared += 1
+
+    # A zero or negative climb count produces no "../" at all, because
+    # repeating a string a non-positive number of times gives "".
+    climbs = len(origin) - shared - 1
+    return "../" * climbs + "/".join(target[shared:])
+
+def process_relative_links(link: dict, page_filename: str, page_filename_head: str) -> bool:
+    """Adjust a relative ``href`` relative to where the page sits in the URL structure.
+
+    This is meant to run *before* the link mapping step, because the latter
+    introduces more relative links to check.
+
+    :param link: the anchor being processed; only its ``href`` entry is read
+        and, when an adjustment is made, overwritten.
+    :param page_filename: output name of the page that contains the link.
+    :param page_filename_head: ``page_filename`` up to its first ``/``.
+    :returns: ``True`` if ``link['href']`` was changed, ``False`` otherwise.
+    """
+    href_link = link.get('href')
+    if not href_link:
+        # Anchors with no href (or an empty one) have nothing to adjust, and
+        # the first-character check below would fail on them.
+        return False
+    print(f"rewrite_hub_links: adjusting relative link: {href_link}")
+    if page_filename_head != page_filename:
+        if is_relative_url(href_link) and href_link[0] not in ['#', '/']:
+            # Only treat the head as a prefix when it is a whole path
+            # component; a bare startswith() would also match e.g. the head
+            # "lib" against "library/page" and mangle the link.
+            if (href_link == page_filename_head
+                    or href_link.startswith(page_filename_head + "/")):
+                # We need to drop the bit that goes up to the first / in
+                # the link because otherwise it gets duplicated when
+                # Next.js processes it.
+                link['href'] = href_link[len(page_filename_head)+1:]
+                print(f"rewrite_hub_links: new relative link: {link['href']}")
+                return True
+            # If we aren't on the same path, and we don't have any traversal
+            # at the start of the path, calculate the traversal required.
+            if not href_link.startswith("../"):
+                new_path = relative_traversal(page_filename, href_link)
+                if new_path != href_link:
+                    link['href'] = new_path
+                    print(f"rewrite_hub_links: new relative link: {link['href']}")
+                    return True
+            print("rewrite_hub_links: no change")
+    else:
+        print("rewrite_hub_links: no relative link adjustment needed")
+    return False
+
+def process_link_mappings(link: dict, link_mappings: dict) -> bool:
+    """Rewrite a link that matches a configured mapping to a ``/library/...`` path.
+
+    The first key of ``link_mappings`` that ``link['href']`` starts with is
+    used. The key is stripped from the front of the href, ``.html`` is
+    removed, and the remainder is placed under
+    ``/library/<mapped value>/``, giving results such as::
+
+        /library/onelab/onelab
+        /library/laa/laa_getting_started
+
+    :param link: the anchor being processed; its ``href`` entry is read and,
+        on a match, overwritten.
+    :param link_mappings: maps href prefixes to documentation root names.
+        ``None`` or an empty mapping means there is nothing to rewrite.
+    :returns: ``True`` if ``link['href']`` was rewritten, ``False`` otherwise.
+    """
+    href = link.get('href')
+    if not link_mappings or href is None:
+        # No mappings configured, or an anchor without an href.
+        return False
+    for key in link_mappings:
+        # Check if the href starts with the key
+        if not href.startswith(key):
+            continue
+        print(f"process_link_mappings: matched {href} against mapping")
+        root_name = link_mappings[key]
+        # Strip the key from the front only; str.replace() would also remove
+        # any later occurrence of the same text inside the path.
+        remainder = href[len(key):]
+        print(f"process_link_mappings: after removing key, we're left with {remainder}")
+        # Remove ".html". Note that this removes every occurrence, so a
+        # ".html" that is followed by more text is dropped as well.
+        remainder = remainder.replace(".html", "")
+        # If we're just left with "index", or if we have nothing left, replace
+        # it with the value from the dictionary, which will also be the
+        # documentation root name
+        if remainder == "index" or remainder == "":
+            remainder = root_name
+        # Drop a trailing "/index" - and only the trailing one.
+        if remainder.endswith("/index"):
+            remainder = remainder[:-len("/index")]
+        link['href'] = f"/library/{root_name}/{remainder}"
+        print(f"process_link_mappings: mapped to {link['href']}")
+        return True
+    return False
+
+def rewrite_hub_links(html: str, link_mappings: dict, page_filename: str) -> str:
+    """Rewrite the ``<a>`` links in ``html`` for use on the Hub.
+
+    Each anchor that has a non-empty ``href`` is first passed to
+    ``process_relative_links`` and then to ``process_link_mappings``.
+
+    :param html: HTML fragment to process.
+    :param link_mappings: maps href prefixes to documentation root names;
+        ``None`` is accepted and treated as "no mappings".
+    :param page_filename: output name of the page the fragment belongs to.
+    :returns: the re-serialised HTML if any link changed, otherwise the
+        ``html`` argument itself, untouched.
+    """
+    print(f"rewrite_hub_links: page_filename={page_filename}")
+    # Callers may pass None when no mappings are configured; iterating over
+    # None in the mapping step would raise.
+    mappings = link_mappings or {}
+    soup = BeautifulSoup(html, "html.parser")
+    # Need to calculate what the start of page_filename looks like
+    # up to the first separator.
+    page_filename_head, _, _ = page_filename.partition("/")
+    print(f"rewrite_hub_links: page_filename_head={page_filename_head}")
+    edited = False
+    # href=True skips anchors that have no href attribute at all (for
+    # example pure "id"/"name" targets), which the helpers cannot index.
+    for link in soup.find_all('a', href=True):
+        if not link['href']:
+            # An empty href has no first character to inspect; leave it be.
+            continue
+        if process_relative_links(link, page_filename, page_filename_head):
+            edited = True
+        if process_link_mappings(link, mappings):
+            edited = True
+
+    # Only re-serialise when something changed so that untouched markup is
+    # returned exactly as it was received.
+    return str(soup) if edited else html