Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ The following subcommands are available:
* `--clean`: Runs the cleanup subcommand before running
* `--only`/`--exclude`: Limits which sections get inlined during processing
* `--no-draft`: Turns off the draft waterman on the PDF
* `--diff from_ref [to_ref]`: Build a document showing changes between two Git refs (e.g. commits, branches, or tags). If `to_ref` is omitted it defaults to `HEAD`. The build uses temporary worktrees to produce combined markdown for each ref, diffs the Pandoc ASTs, then runs the usual pipeline on the annotated diff; output files are named like `diff_<short_from>_<short_to>.pdf`.
* `clean`: Cleans any build artifacts.
* `lint`: Lints the build output for common issues.
* `export`: Exports the git archive to a zip for sharing.
Expand All @@ -103,6 +104,17 @@ The following subcommands are available:
Only the `build` subcommand is routinely tested and supported. The others are convenience
methods and may or may not work.

#### Diff workflow (`--diff`)

When `--diff from_ref [to_ref]` is used, the builder does not build from the current working tree. Instead it:

1. Creates temporary Git worktrees at `from_ref` and `to_ref` (defaulting `to_ref` to `HEAD`).
2. For each ref, runs the usual preprocessing (flatten specification into a single combined markdown file).
3. Converts each combined markdown to a Pandoc JSON AST, diffs the two ASTs to produce an annotated diff (added/removed blocks), and converts the diff AST back to markdown.
4. Runs the normal Pandoc pipeline (filters, PDF/HTML/DOCX) on that combined diff markdown.

Outputs are written under the normal build output directory with base name `diff_<short_from>_<short_to>` so they do not overwrite a regular build.

### Dependencies

Dependencies are automatically installed by [pixi](https://pixi.sh), and should work on macOS/Linux/Windows.
Expand Down
144 changes: 144 additions & 0 deletions doc_build/ast_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""Pandoc AST differencing. Core logic for the Pandoc AST Differencing Tool.

The core logic uses the Longest Common Subsequence (LCS) algorithm to align
the block-level elements of the two documents.

- **Added** blocks from the new file are included and marked.
- **Removed** blocks from the old file are included and marked.
- **Changed** blocks are detected by a direct comparison of common elements and
are included (from the new version) with a 'changed' mark.

Metadata is added by wrapping the target block in a Pandoc 'Div' element
with the attribute `diff=<status>`, where status is one of:
- 'added'
- 'removed'
"""

import json

from typing import List, Dict, Any, Tuple

PandocNode = Dict[str, Any]
PandocAst = Dict[str, Any]
NodeList = List[PandocNode]


def add_diff_meta(node: PandocNode, status: str) -> PandocNode:
"""
Wraps a Pandoc AST node in a Div block to add diff metadata.

This is the standard Pandoc method for adding block-level attributes.

Args:
node: The Pandoc node to wrap.
status: The difference status ('added', 'removed').

Returns:
A new 'Div' node containing the original node and the diff attribute.
"""
attr: Tuple[str, List[str], List[Tuple[str, str]]] = ("", [], [("diff", status)])

return {"t": "Div", "c": [attr, [node]]}


def find_longest_common_subsequence(list_a: NodeList, list_b: NodeList) -> NodeList:
"""
Computes the Longest Common Subsequence (LCS) of two lists of nodes.

This uses a classic dynamic programming approach. The nodes are compared
for deep equality.
"""
m, n = len(list_a), len(list_b)
dp = [[[] for _ in range(n + 1)] for _ in range(m + 1)]
a_strs = [json.dumps(node, sort_keys=True) for node in list_a]
b_strs = [json.dumps(node, sort_keys=True) for node in list_b]

for i in range(1, m + 1):
for j in range(1, n + 1):
if a_strs[i - 1] == b_strs[j - 1]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good find, thanks! :)

dp[i][j] = dp[i - 1][j - 1] + [list_a[i - 1]]
else:
if len(dp[i - 1][j]) > len(dp[i][j - 1]):
dp[i][j] = dp[i - 1][j]
else:
dp[i][j] = dp[i][j - 1]
return dp[m][n]


def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeList:
"""
Compares two lists of Pandoc blocks and generates a merged list with annotations.

This is the core diffing engine. It walks through both lists and the LCS
to identify added and removed blocks.
"""
lcs_nodes = find_longest_common_subsequence(before_blocks, after_blocks)
lcs_set = {json.dumps(node, sort_keys=True) for node in lcs_nodes}

merged_blocks: NodeList = []

ptr_a, ptr_b = 0, 0
while ptr_a < len(before_blocks) or ptr_b < len(after_blocks):
node_a = before_blocks[ptr_a] if ptr_a < len(before_blocks) else None
node_b = after_blocks[ptr_b] if ptr_b < len(after_blocks) else None

node_a_str = json.dumps(node_a, sort_keys=True) if node_a else None
node_b_str = json.dumps(node_b, sort_keys=True) if node_b else None

if node_a and node_a_str not in lcs_set:
# This node from 'before' is not in the LCS, so it was removed.
merged_blocks.append(add_diff_meta(node_a, "removed"))
ptr_a += 1
elif node_b and node_b_str not in lcs_set:
# This node from 'after' is not in the LCS, so it was added.
merged_blocks.append(add_diff_meta(node_b, "added"))
ptr_b += 1
elif node_a and node_b:
# Both nodes are present and part of the LCS path.
# Check if they are identical. If not, mark as changed.
if node_a_str != node_b_str:
# The content at this aligned position has changed.
# Mark the 'before' version as removed and 'after' as added.
# This is a common way to show a "change".
merged_blocks.append(add_diff_meta(node_a, "removed"))
merged_blocks.append(add_diff_meta(node_b, "added"))
else:
# The blocks are identical, add them without modification.
merged_blocks.append(node_a)
ptr_a += 1
ptr_b += 1
elif ptr_a < len(before_blocks):
# Exhausted 'after_blocks', remaining 'before' blocks are removals.
merged_blocks.append(add_diff_meta(before_blocks[ptr_a], "removed"))
ptr_a += 1
elif ptr_b < len(after_blocks):
# Exhausted 'before_blocks', remaining 'after' blocks are additions.
merged_blocks.append(add_diff_meta(after_blocks[ptr_b], "added"))
ptr_b += 1

return merged_blocks


def diff_ast_files(before_path, after_path, output_path):
"""Read two Pandoc AST JSON files, diff their blocks, write the result."""
with open(before_path, "r", encoding="utf-8") as f:
before_ast: PandocAst = json.load(f)

with open(after_path, "r", encoding="utf-8") as f:
after_ast: PandocAst = json.load(f)

before_blocks: NodeList = before_ast.get("blocks", [])
after_blocks: NodeList = after_ast.get("blocks", [])

merged_blocks = diff_block_lists(before_blocks, after_blocks)

output_ast: PandocAst = {
"pandoc-api-version": after_ast["pandoc-api-version"],
"meta": after_ast["meta"],
"blocks": merged_blocks,
}

with open(output_path, "w", encoding="utf-8") as f:
json.dump(output_ast, f, indent=2)

return output_ast
115 changes: 100 additions & 15 deletions doc_build/doc_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
import os
import re
import time
import types
from pathlib import Path
from datetime import datetime
from typing import Dict
from typing import Dict, Optional, Union

from doc_build.ast_diff import diff_ast_files

try:
import yaml
Expand All @@ -20,6 +23,8 @@
sys.exit("Python 3.10 or greater is required.")


MARKDOWN_FORMAT = "markdown-hard_line_breaks"

class _OneOrTwoArgsAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if values is not None and hasattr(values, "__len__") and len(values) > 2:
Expand Down Expand Up @@ -89,8 +94,17 @@ def get_output(self, arguments, *args, **kwargs):


class DocBuilder:
def __init__(self):
def __init__(self, *, repo_root: Optional[Union[Path, str]] = None):
super().__init__()
if repo_root is not None:
self._repo_root = Path(repo_root)
else:
self._repo_root = Path(
git.get_output(
["rev-parse", "--show-toplevel"],
cwd=self._get_class_file().parent,
).strip()
)

# MARK: Target Functions
def build_docs(self, args):
Expand All @@ -104,13 +118,16 @@ def build_docs(self, args):
elif len(args.diff) > 2:
raise ValueError(f"At most 2 arguments for --diff - got {len(args.diff)}")
args.output.mkdir(parents=True, exist_ok=True)
self.get_artifacts_dir(args.output).mkdir(parents=True, exist_ok=True)

shutil.copytree(
self.get_specification_root(),
self.get_artifacts_dir(args.output),
dirs_exist_ok=True,
)
combined = self.preprocess_build(args)
if args.diff:
combined = self.generate_combined_diff(
args, args.diff[0], args.diff[1]
)
filename = combined.stem
else:
combined = self._setup_and_preprocess(args)
filename = self.get_file_base_name()

spec = self.get_metadata_defaults_file()
subtitle = self.get_subtitle(spec)
Expand Down Expand Up @@ -164,7 +181,7 @@ def build_docs(self, args):
"2",
"--standalone",
"--number-sections=true",
"--from=markdown-hard_line_breaks",
"--from", MARKDOWN_FORMAT,
"--pdf-engine=tectonic",
]

Expand All @@ -176,8 +193,6 @@ def build_docs(self, args):
docx = None
html = None

filename = self.get_file_base_name()

if not args.no_html:
html = args.output / f"{filename}.html"
html_template = self.get_scripts_root() / "template" / "default.html5"
Expand Down Expand Up @@ -313,6 +328,73 @@ def flatten(self, args, source, output, substitutions: Dict[str, str] = None):
else:
out.write(line)

def _setup_and_preprocess(self, args):
"""Copy specification into artifacts dir and run preprocess_build. Caller must ensure args.output exists."""
shutil.copytree(
self.get_specification_root(),
self.get_artifacts_dir(args.output),
dirs_exist_ok=True,
)
return self.preprocess_build(args)

def _build_combined_for_ref(self, args, ref, worktree_path, output_subdir):
"""Build combined.md for a given ref using a temporary worktree. Removes worktree in finally."""
worktree_path = Path(worktree_path)
output_dir = Path(args.output) / output_subdir
try:
git(
["worktree", "add", str(worktree_path), ref],
cwd=self.get_repo_root(),
)
builder = self.__class__(repo_root=worktree_path)
ref_args = types.SimpleNamespace(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is cool :)

output=output_dir,
no_draft=getattr(args, "no_draft", False),
only=getattr(args, "only", []),
exclude=getattr(args, "exclude", []),
)
output_dir.mkdir(parents=True, exist_ok=True)
return builder._setup_and_preprocess(ref_args)
finally:
try:
git(
["worktree", "remove", str(worktree_path)],
cwd=self.get_repo_root(),
)
except subprocess.CalledProcessError:
pass

def generate_combined_diff(self, args, from_ref, to_ref):
"""Build combined diff markdown from two refs; returns path to diff_X_Y.md."""
from_short = self.resolve_ref(from_ref, short=True)
to_short = self.resolve_ref(to_ref, short=True)
diff_basename = f"diff_{from_short}_{to_short}"
diff_dir = args.output / "diff"
diff_dir.mkdir(parents=True, exist_ok=True)
worktree_from = diff_dir / "wt_from"
worktree_to = diff_dir / "wt_to"
combined_from = self._build_combined_for_ref(
args, from_ref, worktree_from, "diff_from"
)
combined_to = self._build_combined_for_ref(
args, to_ref, worktree_to, "diff_to"
)
ast_from = diff_dir / "ast_from.json"
ast_to = diff_dir / "ast_to.json"
pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_from, combined_from])
pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_to, combined_to])
diff_ast_path = diff_dir / f"{diff_basename}.json"
diff_ast_files(
str(ast_from), str(ast_to), str(diff_ast_path)
)
# Not strictly necessary (Pandoc can take JSON as input), but converting
# to markdown unifies the pipeline with the non-diff path and eases debugging.
combined_diff_md = diff_dir / f"{diff_basename}.md"
pandoc(
["-f", "json", "-t", MARKDOWN_FORMAT, "-o", combined_diff_md, diff_ast_path]
)
return combined_diff_md

def clean_docs(self, args):
if args.output.exists():
shutil.rmtree(args.output)
Expand Down Expand Up @@ -490,8 +572,7 @@ def get_scripts_root(self) -> Path:
return Path(__file__).resolve().parent

def get_repo_root(self) -> Path:
"""Assumes that the repo root is two up from this root"""
return self._get_class_file().parent.parent
return self._repo_root

def get_specification_root(self) -> Path:
return self.get_repo_root() / "specification"
Expand All @@ -512,10 +593,15 @@ def get_entry_point(self, args) -> Path:

# MARK: Utility Functions

def resolve_ref(self, ref: str, short: bool = False) -> str:
"""Resolve a git ref (branch, tag, hash) to a commit hash in the repo."""
args = ["rev-parse", "--short", ref] if short else ["rev-parse", ref]
return git.get_output(args, cwd=self.get_repo_root()).strip()

def get_subtitle(self, defaults_file_path: Path):
with open(defaults_file_path, "r") as f:
spec_data = yaml.load(f, Loader=yaml.SafeLoader)
commit = git.get_output(["rev-parse", "--short", "HEAD"], cwd=self.get_repo_root()).strip()
commit = self.resolve_ref("HEAD", short=True)
subtitle = f"v{spec_data['metadata']['version']} ({commit})"
return subtitle

Expand Down Expand Up @@ -604,7 +690,6 @@ def make_build_parser(self, subparsers):
build_parser.add_argument(
"--no-draft", help="Do not add draft watermark", action="store_true"
)
# TODO: implement support (ie, use 'args.diff')
build_parser.add_argument(
"--diff",
nargs="+",
Expand Down
4 changes: 3 additions & 1 deletion tests/build_scripts/build_docs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#! /usr/bin/env python3

from doc_build.doc_builder import DocBuilder
from pathlib import Path

test_root = Path(__file__).parent.parent

class MyDocBuilder(DocBuilder):
pass


if __name__ == "__main__":
MyDocBuilder().process_argparser()
MyDocBuilder(repo_root=test_root).process_argparser()
Loading