diff --git a/README.md b/README.md index f094a35..a3026af 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ The following subcommands are available: * `--clean`: Runs the cleanup subcommand before running * `--only`/`--exclude`: Limits which sections get inlined during processing * `--no-draft`: Turns off the draft waterman on the PDF + * `--diff from_ref [to_ref]`: Build a document showing changes between two Git refs (e.g. commits, branches, or tags). If `to_ref` is omitted it defaults to `HEAD`. The build uses temporary worktrees to produce combined markdown for each ref, diffs the Pandoc ASTs, then runs the usual pipeline on the annotated diff; output files are named like `diff_<from>_<to>.pdf`. * `clean`: Cleans any build artifacts. * `lint`: Lints the build output for common issues. * `export`: Exports the git archive to a zip for sharing. @@ -103,6 +104,17 @@ The following subcommands are available: Only the `build` subcommand is routinely tested and supported. The others are convenience methods and may or may not work. +#### Diff workflow (`--diff`) + +When `--diff from_ref [to_ref]` is used, the builder does not build from the current working tree. Instead it: + +1. Creates temporary Git worktrees at `from_ref` and `to_ref` (defaulting `to_ref` to `HEAD`). +2. For each ref, runs the usual preprocessing (flatten specification into a single combined markdown file). +3. Converts each combined markdown to a Pandoc JSON AST, diffs the two ASTs to produce an annotated diff (added/removed blocks), and converts the diff AST back to markdown. +4. Runs the normal Pandoc pipeline (filters, PDF/HTML/DOCX) on that combined diff markdown. + +Outputs are written under the normal build output directory with base name `diff_<from>_<to>` (where `<from>` and `<to>` are the abbreviated commit hashes of the two refs) so they do not overwrite a regular build. + ### Dependencies Dependencies are automatically installed by [pixi](https://pixi.sh), and should work on macOS/Linux/Windows. 
diff --git a/doc_build/ast_diff.py b/doc_build/ast_diff.py new file mode 100644 index 0000000..a851ad2 --- /dev/null +++ b/doc_build/ast_diff.py @@ -0,0 +1,144 @@ +"""Pandoc AST differencing. Core logic for the Pandoc AST Differencing Tool. + +The core logic uses the Longest Common Subsequence (LCS) algorithm to align +the block-level elements of the two documents. + +- **Added** blocks from the new file are included and marked. +- **Removed** blocks from the old file are included and marked. +- **Changed** blocks are detected by a direct comparison of common elements and + are emitted as a 'removed' block (old version) followed by an 'added' block (new version). + +Metadata is added by wrapping the target block in a Pandoc 'Div' element +with the attribute `diff=<status>`, where status is one of: +- 'added' +- 'removed' +""" + +import json + +from typing import List, Dict, Any, Tuple + +PandocNode = Dict[str, Any] +PandocAst = Dict[str, Any] +NodeList = List[PandocNode] + + +def add_diff_meta(node: PandocNode, status: str) -> PandocNode: + """ + Wraps a Pandoc AST node in a Div block to add diff metadata. + + This is the standard Pandoc method for adding block-level attributes. + + Args: + node: The Pandoc node to wrap. + status: The difference status ('added', 'removed'). + + Returns: + A new 'Div' node containing the original node and the diff attribute. + """ + attr: Tuple[str, List[str], List[Tuple[str, str]]] = ("", [], [("diff", status)]) + + return {"t": "Div", "c": [attr, [node]]} + + +def find_longest_common_subsequence(list_a: NodeList, list_b: NodeList) -> NodeList: + """ + Computes the Longest Common Subsequence (LCS) of two lists of nodes. + + This uses a classic dynamic programming approach. The nodes are compared + for deep equality. 
+ """ + m, n = len(list_a), len(list_b) + dp = [[[] for _ in range(n + 1)] for _ in range(m + 1)] + a_strs = [json.dumps(node, sort_keys=True) for node in list_a] + b_strs = [json.dumps(node, sort_keys=True) for node in list_b] + + for i in range(1, m + 1): + for j in range(1, n + 1): + if a_strs[i - 1] == b_strs[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + [list_a[i - 1]] + else: + if len(dp[i - 1][j]) > len(dp[i][j - 1]): + dp[i][j] = dp[i - 1][j] + else: + dp[i][j] = dp[i][j - 1] + return dp[m][n] + + +def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeList: + """ + Compares two lists of Pandoc blocks and generates a merged list with annotations. + + This is the core diffing engine. It walks through both lists and the LCS + to identify added and removed blocks. + """ + lcs_nodes = find_longest_common_subsequence(before_blocks, after_blocks) + lcs_set = {json.dumps(node, sort_keys=True) for node in lcs_nodes} + + merged_blocks: NodeList = [] + + ptr_a, ptr_b = 0, 0 + while ptr_a < len(before_blocks) or ptr_b < len(after_blocks): + node_a = before_blocks[ptr_a] if ptr_a < len(before_blocks) else None + node_b = after_blocks[ptr_b] if ptr_b < len(after_blocks) else None + + node_a_str = json.dumps(node_a, sort_keys=True) if node_a else None + node_b_str = json.dumps(node_b, sort_keys=True) if node_b else None + + if node_a and node_a_str not in lcs_set: + # This node from 'before' is not in the LCS, so it was removed. + merged_blocks.append(add_diff_meta(node_a, "removed")) + ptr_a += 1 + elif node_b and node_b_str not in lcs_set: + # This node from 'after' is not in the LCS, so it was added. + merged_blocks.append(add_diff_meta(node_b, "added")) + ptr_b += 1 + elif node_a and node_b: + # Both nodes are present and part of the LCS path. + # Check if they are identical. If not, mark as changed. + if node_a_str != node_b_str: + # The content at this aligned position has changed. + # Mark the 'before' version as removed and 'after' as added. 
+ # This is a common way to show a "change". + merged_blocks.append(add_diff_meta(node_a, "removed")) + merged_blocks.append(add_diff_meta(node_b, "added")) + else: + # The blocks are identical, add them without modification. + merged_blocks.append(node_a) + ptr_a += 1 + ptr_b += 1 + elif ptr_a < len(before_blocks): + # Exhausted 'after_blocks', remaining 'before' blocks are removals. + merged_blocks.append(add_diff_meta(before_blocks[ptr_a], "removed")) + ptr_a += 1 + elif ptr_b < len(after_blocks): + # Exhausted 'before_blocks', remaining 'after' blocks are additions. + merged_blocks.append(add_diff_meta(after_blocks[ptr_b], "added")) + ptr_b += 1 + + return merged_blocks + + +def diff_ast_files(before_path, after_path, output_path): + """Read two Pandoc AST JSON files, diff their blocks, write the result.""" + with open(before_path, "r", encoding="utf-8") as f: + before_ast: PandocAst = json.load(f) + + with open(after_path, "r", encoding="utf-8") as f: + after_ast: PandocAst = json.load(f) + + before_blocks: NodeList = before_ast.get("blocks", []) + after_blocks: NodeList = after_ast.get("blocks", []) + + merged_blocks = diff_block_lists(before_blocks, after_blocks) + + output_ast: PandocAst = { + "pandoc-api-version": after_ast["pandoc-api-version"], + "meta": after_ast["meta"], + "blocks": merged_blocks, + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(output_ast, f, indent=2) + + return output_ast diff --git a/doc_build/doc_builder.py b/doc_build/doc_builder.py index 5eb9f8a..cdbcca2 100644 --- a/doc_build/doc_builder.py +++ b/doc_build/doc_builder.py @@ -7,9 +7,12 @@ import os import re import time +import types from pathlib import Path from datetime import datetime -from typing import Dict +from typing import Dict, Optional, Union + +from doc_build.ast_diff import diff_ast_files try: import yaml @@ -20,6 +23,8 @@ sys.exit("Python 3.10 or greater is required.") +MARKDOWN_FORMAT = "markdown-hard_line_breaks" + class 
_OneOrTwoArgsAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): if values is not None and hasattr(values, "__len__") and len(values) > 2: @@ -89,8 +94,17 @@ def get_output(self, arguments, *args, **kwargs): class DocBuilder: - def __init__(self): + def __init__(self, *, repo_root: Optional[Union[Path, str]] = None): super().__init__() + if repo_root is not None: + self._repo_root = Path(repo_root) + else: + self._repo_root = Path( + git.get_output( + ["rev-parse", "--show-toplevel"], + cwd=self._get_class_file().parent, + ).strip() + ) # MARK: Target Functions def build_docs(self, args): @@ -104,13 +118,16 @@ def build_docs(self, args): elif len(args.diff) > 2: raise ValueError(f"At most 2 arguments for --diff - got {len(args.diff)}") args.output.mkdir(parents=True, exist_ok=True) + self.get_artifacts_dir(args.output).mkdir(parents=True, exist_ok=True) - shutil.copytree( - self.get_specification_root(), - self.get_artifacts_dir(args.output), - dirs_exist_ok=True, - ) - combined = self.preprocess_build(args) + if args.diff: + combined = self.generate_combined_diff( + args, args.diff[0], args.diff[1] + ) + filename = combined.stem + else: + combined = self._setup_and_preprocess(args) + filename = self.get_file_base_name() spec = self.get_metadata_defaults_file() subtitle = self.get_subtitle(spec) @@ -164,7 +181,7 @@ def build_docs(self, args): "2", "--standalone", "--number-sections=true", - "--from=markdown-hard_line_breaks", + "--from", MARKDOWN_FORMAT, "--pdf-engine=tectonic", ] @@ -176,8 +193,6 @@ def build_docs(self, args): docx = None html = None - filename = self.get_file_base_name() - if not args.no_html: html = args.output / f"{filename}.html" html_template = self.get_scripts_root() / "template" / "default.html5" @@ -313,6 +328,73 @@ def flatten(self, args, source, output, substitutions: Dict[str, str] = None): else: out.write(line) + def _setup_and_preprocess(self, args): + """Copy specification into artifacts dir 
and run preprocess_build. Caller must ensure args.output exists.""" + shutil.copytree( + self.get_specification_root(), + self.get_artifacts_dir(args.output), + dirs_exist_ok=True, + ) + return self.preprocess_build(args) + + def _build_combined_for_ref(self, args, ref, worktree_path, output_subdir): + """Build combined.md for a given ref using a temporary worktree. Removes worktree in finally.""" + worktree_path = Path(worktree_path) + output_dir = Path(args.output) / output_subdir + try: + git( + ["worktree", "add", str(worktree_path), ref], + cwd=self.get_repo_root(), + ) + builder = self.__class__(repo_root=worktree_path) + ref_args = types.SimpleNamespace( + output=output_dir, + no_draft=getattr(args, "no_draft", False), + only=getattr(args, "only", []), + exclude=getattr(args, "exclude", []), + ) + output_dir.mkdir(parents=True, exist_ok=True) + return builder._setup_and_preprocess(ref_args) + finally: + try: + git( + ["worktree", "remove", str(worktree_path)], + cwd=self.get_repo_root(), + ) + except subprocess.CalledProcessError: + pass + + def generate_combined_diff(self, args, from_ref, to_ref): + """Build combined diff markdown from two refs; returns path to diff_X_Y.md.""" + from_short = self.resolve_ref(from_ref, short=True) + to_short = self.resolve_ref(to_ref, short=True) + diff_basename = f"diff_{from_short}_{to_short}" + diff_dir = args.output / "diff" + diff_dir.mkdir(parents=True, exist_ok=True) + worktree_from = diff_dir / "wt_from" + worktree_to = diff_dir / "wt_to" + combined_from = self._build_combined_for_ref( + args, from_ref, worktree_from, "diff_from" + ) + combined_to = self._build_combined_for_ref( + args, to_ref, worktree_to, "diff_to" + ) + ast_from = diff_dir / "ast_from.json" + ast_to = diff_dir / "ast_to.json" + pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_from, combined_from]) + pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_to, combined_to]) + diff_ast_path = diff_dir / f"{diff_basename}.json" + diff_ast_files( 
+ str(ast_from), str(ast_to), str(diff_ast_path) + ) + # Not strictly necessary (Pandoc can take JSON as input), but converting + # to markdown unifies the pipeline with the non-diff path and eases debugging. + combined_diff_md = diff_dir / f"{diff_basename}.md" + pandoc( + ["-f", "json", "-t", MARKDOWN_FORMAT, "-o", combined_diff_md, diff_ast_path] + ) + return combined_diff_md + def clean_docs(self, args): if args.output.exists(): shutil.rmtree(args.output) @@ -490,8 +572,7 @@ def get_scripts_root(self) -> Path: return Path(__file__).resolve().parent def get_repo_root(self) -> Path: - """Assumes that the repo root is two up from this root""" - return self._get_class_file().parent.parent + return self._repo_root def get_specification_root(self) -> Path: return self.get_repo_root() / "specification" @@ -512,10 +593,15 @@ def get_entry_point(self, args) -> Path: # MARK: Utility Functions + def resolve_ref(self, ref: str, short: bool = False) -> str: + """Resolve a git ref (branch, tag, hash) to a commit hash in the repo.""" + args = ["rev-parse", "--short", ref] if short else ["rev-parse", ref] + return git.get_output(args, cwd=self.get_repo_root()).strip() + def get_subtitle(self, defaults_file_path: Path): with open(defaults_file_path, "r") as f: spec_data = yaml.load(f, Loader=yaml.SafeLoader) - commit = git.get_output(["rev-parse", "--short", "HEAD"], cwd=self.get_repo_root()).strip() + commit = self.resolve_ref("HEAD", short=True) subtitle = f"v{spec_data['metadata']['version']} ({commit})" return subtitle @@ -604,7 +690,6 @@ def make_build_parser(self, subparsers): build_parser.add_argument( "--no-draft", help="Do not add draft watermark", action="store_true" ) - # TODO: implement support (ie, use 'args.diff') build_parser.add_argument( "--diff", nargs="+", diff --git a/tests/build_scripts/build_docs.py b/tests/build_scripts/build_docs.py index 65c2901..72c3dbb 100644 --- a/tests/build_scripts/build_docs.py +++ b/tests/build_scripts/build_docs.py @@ -1,11 
+1,13 @@ #! /usr/bin/env python3 from doc_build.doc_builder import DocBuilder +from pathlib import Path +test_root = Path(__file__).parent.parent class MyDocBuilder(DocBuilder): pass if __name__ == "__main__": - MyDocBuilder().process_argparser() + MyDocBuilder(repo_root=test_root).process_argparser() diff --git a/tools/pandoc_ast_diff.py b/tools/pandoc_ast_diff.py index a52512a..94d3bbd 100644 --- a/tools/pandoc_ast_diff.py +++ b/tools/pandoc_ast_diff.py @@ -8,126 +8,19 @@ The output file contains a merged view, with added or removed nodes annotated with custom metadata. -The core logic uses the Longest Common Subsequence (LCS) algorithm to align -the block-level elements of the two documents. - -- **Added** blocks from the new file are included and marked. -- **Removed** blocks from the old file are included and marked. -- **Changed** blocks are detected by a direct comparison of common elements and - are included (from the new version) with a 'changed' mark. - -Metadata is added by wrapping the target block in a Pandoc 'Div' element -with the attribute `diff=`, where status is one of: -- 'added' -- 'removed' - Usage: python pandoc_ast_diff.py before.json after.json output.json """ -import json import argparse -from typing import List, Dict, Any, Tuple - -PandocNode = Dict[str, Any] -PandocAst = Dict[str, Any] -NodeList = List[PandocNode] - - -def add_diff_meta(node: PandocNode, status: str) -> PandocNode: - """ - Wraps a Pandoc AST node in a Div block to add diff metadata. - - This is the standard Pandoc method for adding block-level attributes. - - Args: - node: The Pandoc node to wrap. - status: The difference status ('added', 'removed'). - - Returns: - A new 'Div' node containing the original node and the diff attribute. 
- """ - attr: Tuple[str, List[str], List[Tuple[str, str]]] = ("", [], [("diff", status)]) - - return {"t": "Div", "c": [attr, [node]]} - - -def find_longest_common_subsequence(list_a: NodeList, list_b: NodeList) -> NodeList: - """ - Computes the Longest Common Subsequence (LCS) of two lists of nodes. - - This uses a classic dynamic programming approach. The nodes are compared - for deep equality. - """ - m, n = len(list_a), len(list_b) - dp = [[[] for _ in range(n + 1)] for _ in range(m + 1)] +import sys +from pathlib import Path - for i in range(1, m + 1): - for j in range(1, n + 1): - if json.dumps(list_a[i - 1], sort_keys=True) == json.dumps( - list_b[j - 1], sort_keys=True - ): - dp[i][j] = dp[i - 1][j - 1] + [list_a[i - 1]] - else: - if len(dp[i - 1][j]) > len(dp[i][j - 1]): - dp[i][j] = dp[i - 1][j] - else: - dp[i][j] = dp[i][j - 1] - return dp[m][n] - - -def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeList: - """ - Compares two lists of Pandoc blocks and generates a merged list with annotations. - - This is the core diffing engine. It walks through both lists and the LCS - to identify added and removed blocks. - """ - lcs_nodes = find_longest_common_subsequence(before_blocks, after_blocks) - lcs_set = {json.dumps(node, sort_keys=True) for node in lcs_nodes} - - merged_blocks: NodeList = [] - - ptr_a, ptr_b = 0, 0 - while ptr_a < len(before_blocks) or ptr_b < len(after_blocks): - node_a = before_blocks[ptr_a] if ptr_a < len(before_blocks) else None - node_b = after_blocks[ptr_b] if ptr_b < len(after_blocks) else None - - node_a_str = json.dumps(node_a, sort_keys=True) if node_a else None - node_b_str = json.dumps(node_b, sort_keys=True) if node_b else None - - if node_a and node_a_str not in lcs_set: - # This node from 'before' is not in the LCS, so it was removed. 
- merged_blocks.append(add_diff_meta(node_a, "removed")) - ptr_a += 1 - elif node_b and node_b_str not in lcs_set: - # This node from 'after' is not in the LCS, so it was added. - merged_blocks.append(add_diff_meta(node_b, "added")) - ptr_b += 1 - elif node_a and node_b: - # Both nodes are present and part of the LCS path. - # Check if they are identical. If not, mark as changed. - if node_a_str != node_b_str: - # The content at this aligned position has changed. - # Mark the 'before' version as removed and 'after' as added. - # This is a common way to show a "change". - merged_blocks.append(add_diff_meta(node_a, "removed")) - merged_blocks.append(add_diff_meta(node_b, "added")) - else: - # The blocks are identical, add them without modification. - merged_blocks.append(node_a) - ptr_a += 1 - ptr_b += 1 - elif ptr_a < len(before_blocks): - # Exhausted 'after_blocks', remaining 'before' blocks are removals. - merged_blocks.append(add_diff_meta(before_blocks[ptr_a], "removed")) - ptr_a += 1 - elif ptr_b < len(after_blocks): - # Exhausted 'before_blocks', remaining 'after' blocks are additions. 
- merged_blocks.append(add_diff_meta(after_blocks[ptr_b], "added")) - ptr_b += 1 - - return merged_blocks +try: + from doc_build.ast_diff import diff_ast_files +except ImportError: + sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + from doc_build.ast_diff import diff_ast_files if __name__ == "__main__": @@ -140,22 +33,5 @@ def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeLis args = parser.parse_args() - with open(args.before_file, "r", encoding="utf-8") as f: - before_ast: PandocAst = json.load(f) - - with open(args.after_file, "r", encoding="utf-8") as f: - after_ast: PandocAst = json.load(f) - - before_blocks: NodeList = before_ast.get("blocks", []) - after_blocks: NodeList = after_ast.get("blocks", []) - - merged_blocks = diff_block_lists(before_blocks, after_blocks) - - output_ast: PandocAst = { - "pandoc-api-version": after_ast["pandoc-api-version"], - "meta": after_ast["meta"], - "blocks": merged_blocks, - } + diff_ast_files(args.before_file, args.after_file, args.output_file) - with open(args.output_file, "w", encoding="utf-8") as f: - json.dump(output_ast, f, indent=2)