aousd · pmolodo · Mar 11, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -92,6 +92,7 @@ The following subcommands are available:
     * `--clean`: Runs the cleanup subcommand before running
     * `--only`/`--exclude`: Limits which sections get inlined during processing
     * `--no-draft`: Turns off the draft waterman on the PDF
+    * `--diff from_ref [to_ref]`: Build a document showing changes between two Git refs (e.g. commits, branches, or tags). If `to_ref` is omitted it defaults to `HEAD`. The build uses temporary worktrees to produce combined markdown for each ref, diffs the Pandoc ASTs, then runs the usual pipeline on the annotated diff; output files are named like `diff_<short_from>_<short_to>.pdf`.
 * `clean`: Cleans any build artifacts.
 * `lint`: Lints the build output for common issues.
 * `export`: Exports the git archive to a zip for sharing.
@@ -103,6 +104,17 @@ The following subcommands are available:
 Only the `build` subcommand is routinely tested and supported. The others are convenience
 methods and may or may not work.
 
+#### Diff workflow (`--diff`)
+
+When `--diff from_ref [to_ref]` is used, the builder does not build from the current working tree. Instead it:
+
+1. Creates temporary Git worktrees at `from_ref` and `to_ref` (defaulting `to_ref` to `HEAD`).
+2. For each ref, runs the usual preprocessing (flatten specification into a single combined markdown file).
+3. Converts each combined markdown to a Pandoc JSON AST, diffs the two ASTs to produce an annotated diff (added/removed blocks), and converts the diff AST back to markdown.
+4. Runs the normal Pandoc pipeline (filters, PDF/HTML/DOCX) on that combined diff markdown.
+
+Outputs are written under the normal build output directory with base name `diff_<short_from>_<short_to>` so they do not overwrite a regular build.
+
 ### Dependencies
 
 Dependencies are automatically installed by [pixi](https://pixi.sh), and should work on macOS/Linux/Windows.

diff --git a/doc_build/ast_diff.py b/doc_build/ast_diff.py
@@ -0,0 +1,144 @@
+"""Pandoc AST differencing. Core logic for the Pandoc AST Differencing Tool.
+
+The core logic uses the Longest Common Subsequence (LCS) algorithm to align
+the block-level elements of the two documents.
+
+- **Added** blocks from the new file are included and marked.
+- **Removed** blocks from the old file are included and marked.
+- **Changed** blocks are detected by a direct comparison of common elements and
+  are included (from the new version) with a 'changed' mark.
+
+Metadata is added by wrapping the target block in a Pandoc 'Div' element
+with the attribute `diff=<status>`, where status is one of:
+- 'added'
+- 'removed'
+"""
+
+import json
+
+from typing import List, Dict, Any, Tuple
+
+PandocNode = Dict[str, Any]
+PandocAst = Dict[str, Any]
+NodeList = List[PandocNode]
+
+
+def add_diff_meta(node: PandocNode, status: str) -> PandocNode:
+    """
+    Wraps a Pandoc AST node in a Div block to add diff metadata.
+
+    This is the standard Pandoc method for adding block-level attributes.
+
+    Args:
+        node: The Pandoc node to wrap.
+        status: The difference status ('added', 'removed').
+
+    Returns:
+        A new 'Div' node containing the original node and the diff attribute.
+    """
+    attr: Tuple[str, List[str], List[Tuple[str, str]]] = ("", [], [("diff", status)])
+
+    return {"t": "Div", "c": [attr, [node]]}
+
+
+def find_longest_common_subsequence(list_a: NodeList, list_b: NodeList) -> NodeList:
+    """
+    Computes the Longest Common Subsequence (LCS) of two lists of nodes.
+
+    This uses a classic dynamic programming approach. The nodes are compared
+    for deep equality.
+    """
+    m, n = len(list_a), len(list_b)
+    dp = [[[] for _ in range(n + 1)] for _ in range(m + 1)]
+    a_strs = [json.dumps(node, sort_keys=True) for node in list_a]
+    b_strs = [json.dumps(node, sort_keys=True) for node in list_b]
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if a_strs[i - 1] == b_strs[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + [list_a[i - 1]]
+            else:
+                if len(dp[i - 1][j]) > len(dp[i][j - 1]):
+                    dp[i][j] = dp[i - 1][j]
+                else:
+                    dp[i][j] = dp[i][j - 1]
+    return dp[m][n]
+
+
+def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeList:
+    """
+    Compares two lists of Pandoc blocks and generates a merged list with annotations.
+
+    This is the core diffing engine. It walks through both lists and the LCS
+    to identify added and removed blocks.
+    """
+    lcs_nodes = find_longest_common_subsequence(before_blocks, after_blocks)
+    lcs_set = {json.dumps(node, sort_keys=True) for node in lcs_nodes}
+
+    merged_blocks: NodeList = []
+
+    ptr_a, ptr_b = 0, 0
+    while ptr_a < len(before_blocks) or ptr_b < len(after_blocks):
+        node_a = before_blocks[ptr_a] if ptr_a < len(before_blocks) else None
+        node_b = after_blocks[ptr_b] if ptr_b < len(after_blocks) else None
+
+        node_a_str = json.dumps(node_a, sort_keys=True) if node_a else None
+        node_b_str = json.dumps(node_b, sort_keys=True) if node_b else None
+
+        if node_a and node_a_str not in lcs_set:
+            # This node from 'before' is not in the LCS, so it was removed.
+            merged_blocks.append(add_diff_meta(node_a, "removed"))
+            ptr_a += 1
+        elif node_b and node_b_str not in lcs_set:
+            # This node from 'after' is not in the LCS, so it was added.
+            merged_blocks.append(add_diff_meta(node_b, "added"))
+            ptr_b += 1
+        elif node_a and node_b:
+            # Both nodes are present and part of the LCS path.
+            # Check if they are identical. If not, mark as changed.
+            if node_a_str != node_b_str:
+                # The content at this aligned position has changed.
+                # Mark the 'before' version as removed and 'after' as added.
+                # This is a common way to show a "change".
+                merged_blocks.append(add_diff_meta(node_a, "removed"))
+                merged_blocks.append(add_diff_meta(node_b, "added"))
+            else:
+                # The blocks are identical, add them without modification.
+                merged_blocks.append(node_a)
+            ptr_a += 1
+            ptr_b += 1
+        elif ptr_a < len(before_blocks):
+            # Exhausted 'after_blocks', remaining 'before' blocks are removals.
+            merged_blocks.append(add_diff_meta(before_blocks[ptr_a], "removed"))
+            ptr_a += 1
+        elif ptr_b < len(after_blocks):
+            # Exhausted 'before_blocks', remaining 'after' blocks are additions.
+            merged_blocks.append(add_diff_meta(after_blocks[ptr_b], "added"))
+            ptr_b += 1
+
+    return merged_blocks
+
+
+def diff_ast_files(before_path, after_path, output_path):
+    """Read two Pandoc AST JSON files, diff their blocks, write the result."""
+    with open(before_path, "r", encoding="utf-8") as f:
+        before_ast: PandocAst = json.load(f)
+
+    with open(after_path, "r", encoding="utf-8") as f:
+        after_ast: PandocAst = json.load(f)
+
+    before_blocks: NodeList = before_ast.get("blocks", [])
+    after_blocks: NodeList = after_ast.get("blocks", [])
+
+    merged_blocks = diff_block_lists(before_blocks, after_blocks)
+
+    output_ast: PandocAst = {
+        "pandoc-api-version": after_ast["pandoc-api-version"],
+        "meta": after_ast["meta"],
+        "blocks": merged_blocks,
+    }
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(output_ast, f, indent=2)
+
+    return output_ast
diff --git a/doc_build/doc_builder.py b/doc_build/doc_builder.py
@@ -7,9 +7,12 @@
 import os
 import re
 import time
+import types
 from pathlib import Path
 from datetime import datetime
-from typing import Dict
+from typing import Dict, Optional, Union
+
+from doc_build.ast_diff import diff_ast_files
 
 try:
     import yaml
@@ -20,6 +23,8 @@
     sys.exit("Python 3.10 or greater is required.")
 
 
+MARKDOWN_FORMAT = "markdown-hard_line_breaks"
+
 class _OneOrTwoArgsAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         if values is not None and hasattr(values, "__len__") and len(values) > 2:
@@ -89,8 +94,17 @@ def get_output(self, arguments, *args, **kwargs):
 
 
 class DocBuilder:
-    def __init__(self):
+    def __init__(self, *, repo_root: Optional[Union[Path, str]] = None):
         super().__init__()
+        if repo_root is not None:
+            self._repo_root = Path(repo_root)
+        else:
+            self._repo_root = Path(
+                git.get_output(
+                    ["rev-parse", "--show-toplevel"],
+                    cwd=self._get_class_file().parent,
+                ).strip()
+            )
 
     # MARK: Target Functions
     def build_docs(self, args):
@@ -104,13 +118,16 @@ def build_docs(self, args):
             elif len(args.diff) > 2:
                 raise ValueError(f"At most 2 arguments for --diff - got {len(args.diff)}")
         args.output.mkdir(parents=True, exist_ok=True)
+        self.get_artifacts_dir(args.output).mkdir(parents=True, exist_ok=True)
 
-        shutil.copytree(
-            self.get_specification_root(),
-            self.get_artifacts_dir(args.output),
-            dirs_exist_ok=True,
-        )
-        combined = self.preprocess_build(args)
+        if args.diff:
+            combined = self.generate_combined_diff(
+                args, args.diff[0], args.diff[1]
+            )
+            filename = combined.stem
+        else:
+            combined = self._setup_and_preprocess(args)
+            filename = self.get_file_base_name()
 
         spec = self.get_metadata_defaults_file()
         subtitle = self.get_subtitle(spec)
@@ -164,7 +181,7 @@ def build_docs(self, args):
             "2",
             "--standalone",
             "--number-sections=true",
-            "--from=markdown-hard_line_breaks",
+            "--from", MARKDOWN_FORMAT,
             "--pdf-engine=tectonic",
         ]
 
@@ -176,8 +193,6 @@ def build_docs(self, args):
         docx = None
         html = None
 
-        filename = self.get_file_base_name()
-
         if not args.no_html:
             html = args.output / f"{filename}.html"
             html_template = self.get_scripts_root() / "template" / "default.html5"
@@ -313,6 +328,73 @@ def flatten(self, args, source, output, substitutions: Dict[str, str] = None):
                     else:
                         out.write(line)
 
+    def _setup_and_preprocess(self, args):
+        """Copy specification into artifacts dir and run preprocess_build. Caller must ensure args.output exists."""
+        shutil.copytree(
+            self.get_specification_root(),
+            self.get_artifacts_dir(args.output),
+            dirs_exist_ok=True,
+        )
+        return self.preprocess_build(args)
+
+    def _build_combined_for_ref(self, args, ref, worktree_path, output_subdir):
+        """Build combined.md for a given ref using a temporary worktree. Removes worktree in finally."""
+        worktree_path = Path(worktree_path)
+        output_dir = Path(args.output) / output_subdir
+        try:
+            git(
+                ["worktree", "add", str(worktree_path), ref],
+                cwd=self.get_repo_root(),
+            )
+            builder = self.__class__(repo_root=worktree_path)
+            ref_args = types.SimpleNamespace(
+                output=output_dir,
+                no_draft=getattr(args, "no_draft", False),
+                only=getattr(args, "only", []),
+                exclude=getattr(args, "exclude", []),
+            )
+            output_dir.mkdir(parents=True, exist_ok=True)
+            return builder._setup_and_preprocess(ref_args)
+        finally:
+            try:
+                git(
+                    ["worktree", "remove", str(worktree_path)],
+                    cwd=self.get_repo_root(),
+                )
+            except subprocess.CalledProcessError:
+                pass
+
+    def generate_combined_diff(self, args, from_ref, to_ref):
+        """Build combined diff markdown from two refs; returns path to diff_X_Y.md."""
+        from_short = self.resolve_ref(from_ref, short=True)
+        to_short = self.resolve_ref(to_ref, short=True)
+        diff_basename = f"diff_{from_short}_{to_short}"
+        diff_dir = args.output / "diff"
+        diff_dir.mkdir(parents=True, exist_ok=True)
+        worktree_from = diff_dir / "wt_from"
+        worktree_to = diff_dir / "wt_to"
+        combined_from = self._build_combined_for_ref(
+            args, from_ref, worktree_from, "diff_from"
+        )
+        combined_to = self._build_combined_for_ref(
+            args, to_ref, worktree_to, "diff_to"
+        )
+        ast_from = diff_dir / "ast_from.json"
+        ast_to = diff_dir / "ast_to.json"
+        pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_from, combined_from])
+        pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_to, combined_to])
+        diff_ast_path = diff_dir / f"{diff_basename}.json"
+        diff_ast_files(
+            str(ast_from), str(ast_to), str(diff_ast_path)
+        )
+        # Not strictly necessary (Pandoc can take JSON as input), but converting
+        # to markdown unifies the pipeline with the non-diff path and eases debugging.
+        combined_diff_md = diff_dir / f"{diff_basename}.md"
+        pandoc(
+            ["-f", "json", "-t", MARKDOWN_FORMAT, "-o", combined_diff_md, diff_ast_path]
+        )
+        return combined_diff_md
+
     def clean_docs(self, args):
         if args.output.exists():
             shutil.rmtree(args.output)
@@ -490,8 +572,7 @@ def get_scripts_root(self) -> Path:
         return Path(__file__).resolve().parent
 
     def get_repo_root(self) -> Path:
-        """Assumes that the repo root is two up from this root"""
-        return self._get_class_file().parent.parent
+        return self._repo_root
 
     def get_specification_root(self) -> Path:
         return self.get_repo_root() / "specification"
@@ -512,10 +593,15 @@ def get_entry_point(self, args) -> Path:
 
     # MARK: Utility Functions
 
+    def resolve_ref(self, ref: str, short: bool = False) -> str:
+        """Resolve a git ref (branch, tag, hash) to a commit hash in the repo."""
+        args = ["rev-parse", "--short", ref] if short else ["rev-parse", ref]
+        return git.get_output(args, cwd=self.get_repo_root()).strip()
+
     def get_subtitle(self, defaults_file_path: Path):
         with open(defaults_file_path, "r") as f:
             spec_data = yaml.load(f, Loader=yaml.SafeLoader)
-            commit = git.get_output(["rev-parse", "--short", "HEAD"], cwd=self.get_repo_root()).strip()
+            commit = self.resolve_ref("HEAD", short=True)
             subtitle = f"v{spec_data['metadata']['version']} ({commit})"
         return subtitle
 
@@ -604,7 +690,6 @@ def make_build_parser(self, subparsers):
         build_parser.add_argument(
             "--no-draft", help="Do not add draft watermark", action="store_true"
         )
-        # TODO: implement support (ie, use 'args.diff')
         build_parser.add_argument(
             "--diff",
             nargs="+",

diff --git a/tests/build_scripts/build_docs.py b/tests/build_scripts/build_docs.py
@@ -1,11 +1,13 @@
 #! /usr/bin/env python3
 
 from doc_build.doc_builder import DocBuilder
+from pathlib import Path
 
+test_root = Path(__file__).parent.parent
 
 class MyDocBuilder(DocBuilder):
     pass
 
 
 if __name__ == "__main__":
-    MyDocBuilder().process_argparser()
+    MyDocBuilder(repo_root=test_root).process_argparser()