-
Notifications
You must be signed in to change notification settings - Fork 2
implemement diff generation as build CLI arg (all commits) #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
93af8f6
move core logic from tools/pandoc_ast_diff.py to doc_build/ast_diff.py
pmolodo ff5bc84
doc_builder: add ability to specify a repo_root
pmolodo d47089c
get_repo_root: don't make assumptions about repository structure
pmolodo 42017ce
factor out resolve_ref and _setup_and_preprocess methods
pmolodo 3f23ed9
implement diff
pmolodo fa06ad4
speed up diff LCS computation
pmolodo File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,144 @@ | ||
| """Pandoc AST differencing. Core logic for the Pandoc AST Differencing Tool. | ||
|
|
||
| The core logic uses the Longest Common Subsequence (LCS) algorithm to align | ||
| the block-level elements of the two documents. | ||
|
|
||
| - **Added** blocks from the new file are included and marked. | ||
| - **Removed** blocks from the old file are included and marked. | ||
| - **Changed** blocks are detected by a direct comparison of common elements and | ||
| are included (from the new version) with a 'changed' mark. | ||
|
|
||
| Metadata is added by wrapping the target block in a Pandoc 'Div' element | ||
| with the attribute `diff=<status>`, where status is one of: | ||
| - 'added' | ||
| - 'removed' | ||
| """ | ||
|
|
||
| import json | ||
|
|
||
| from typing import List, Dict, Any, Tuple | ||
|
|
||
| PandocNode = Dict[str, Any] | ||
| PandocAst = Dict[str, Any] | ||
| NodeList = List[PandocNode] | ||
|
|
||
|
|
||
| def add_diff_meta(node: PandocNode, status: str) -> PandocNode: | ||
| """ | ||
| Wraps a Pandoc AST node in a Div block to add diff metadata. | ||
|
|
||
| This is the standard Pandoc method for adding block-level attributes. | ||
|
|
||
| Args: | ||
| node: The Pandoc node to wrap. | ||
| status: The difference status ('added', 'removed'). | ||
|
|
||
| Returns: | ||
| A new 'Div' node containing the original node and the diff attribute. | ||
| """ | ||
| attr: Tuple[str, List[str], List[Tuple[str, str]]] = ("", [], [("diff", status)]) | ||
|
|
||
| return {"t": "Div", "c": [attr, [node]]} | ||
|
|
||
|
|
||
| def find_longest_common_subsequence(list_a: NodeList, list_b: NodeList) -> NodeList: | ||
| """ | ||
| Computes the Longest Common Subsequence (LCS) of two lists of nodes. | ||
|
|
||
| This uses a classic dynamic programming approach. The nodes are compared | ||
| for deep equality. | ||
| """ | ||
| m, n = len(list_a), len(list_b) | ||
| dp = [[[] for _ in range(n + 1)] for _ in range(m + 1)] | ||
| a_strs = [json.dumps(node, sort_keys=True) for node in list_a] | ||
| b_strs = [json.dumps(node, sort_keys=True) for node in list_b] | ||
|
|
||
| for i in range(1, m + 1): | ||
| for j in range(1, n + 1): | ||
| if a_strs[i - 1] == b_strs[j - 1]: | ||
| dp[i][j] = dp[i - 1][j - 1] + [list_a[i - 1]] | ||
| else: | ||
| if len(dp[i - 1][j]) > len(dp[i][j - 1]): | ||
| dp[i][j] = dp[i - 1][j] | ||
| else: | ||
| dp[i][j] = dp[i][j - 1] | ||
| return dp[m][n] | ||
|
|
||
|
|
||
| def diff_block_lists(before_blocks: NodeList, after_blocks: NodeList) -> NodeList: | ||
| """ | ||
| Compares two lists of Pandoc blocks and generates a merged list with annotations. | ||
|
|
||
| This is the core diffing engine. It walks through both lists and the LCS | ||
| to identify added and removed blocks. | ||
| """ | ||
| lcs_nodes = find_longest_common_subsequence(before_blocks, after_blocks) | ||
| lcs_set = {json.dumps(node, sort_keys=True) for node in lcs_nodes} | ||
|
|
||
| merged_blocks: NodeList = [] | ||
|
|
||
| ptr_a, ptr_b = 0, 0 | ||
| while ptr_a < len(before_blocks) or ptr_b < len(after_blocks): | ||
| node_a = before_blocks[ptr_a] if ptr_a < len(before_blocks) else None | ||
| node_b = after_blocks[ptr_b] if ptr_b < len(after_blocks) else None | ||
|
|
||
| node_a_str = json.dumps(node_a, sort_keys=True) if node_a else None | ||
| node_b_str = json.dumps(node_b, sort_keys=True) if node_b else None | ||
|
|
||
| if node_a and node_a_str not in lcs_set: | ||
| # This node from 'before' is not in the LCS, so it was removed. | ||
| merged_blocks.append(add_diff_meta(node_a, "removed")) | ||
| ptr_a += 1 | ||
| elif node_b and node_b_str not in lcs_set: | ||
| # This node from 'after' is not in the LCS, so it was added. | ||
| merged_blocks.append(add_diff_meta(node_b, "added")) | ||
| ptr_b += 1 | ||
| elif node_a and node_b: | ||
| # Both nodes are present and part of the LCS path. | ||
| # Check if they are identical. If not, mark as changed. | ||
| if node_a_str != node_b_str: | ||
| # The content at this aligned position has changed. | ||
| # Mark the 'before' version as removed and 'after' as added. | ||
| # This is a common way to show a "change". | ||
| merged_blocks.append(add_diff_meta(node_a, "removed")) | ||
| merged_blocks.append(add_diff_meta(node_b, "added")) | ||
| else: | ||
| # The blocks are identical, add them without modification. | ||
| merged_blocks.append(node_a) | ||
| ptr_a += 1 | ||
| ptr_b += 1 | ||
| elif ptr_a < len(before_blocks): | ||
| # Exhausted 'after_blocks', remaining 'before' blocks are removals. | ||
| merged_blocks.append(add_diff_meta(before_blocks[ptr_a], "removed")) | ||
| ptr_a += 1 | ||
| elif ptr_b < len(after_blocks): | ||
| # Exhausted 'before_blocks', remaining 'after' blocks are additions. | ||
| merged_blocks.append(add_diff_meta(after_blocks[ptr_b], "added")) | ||
| ptr_b += 1 | ||
|
|
||
| return merged_blocks | ||
|
|
||
|
|
||
| def diff_ast_files(before_path, after_path, output_path): | ||
| """Read two Pandoc AST JSON files, diff their blocks, write the result.""" | ||
| with open(before_path, "r", encoding="utf-8") as f: | ||
| before_ast: PandocAst = json.load(f) | ||
|
|
||
| with open(after_path, "r", encoding="utf-8") as f: | ||
| after_ast: PandocAst = json.load(f) | ||
|
|
||
| before_blocks: NodeList = before_ast.get("blocks", []) | ||
| after_blocks: NodeList = after_ast.get("blocks", []) | ||
|
|
||
| merged_blocks = diff_block_lists(before_blocks, after_blocks) | ||
|
|
||
| output_ast: PandocAst = { | ||
| "pandoc-api-version": after_ast["pandoc-api-version"], | ||
| "meta": after_ast["meta"], | ||
| "blocks": merged_blocks, | ||
| } | ||
|
|
||
| with open(output_path, "w", encoding="utf-8") as f: | ||
| json.dump(output_ast, f, indent=2) | ||
|
|
||
| return output_ast | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,9 +7,12 @@ | |
| import os | ||
| import re | ||
| import time | ||
| import types | ||
| from pathlib import Path | ||
| from datetime import datetime | ||
| from typing import Dict | ||
| from typing import Dict, Optional, Union | ||
|
|
||
| from doc_build.ast_diff import diff_ast_files | ||
|
|
||
| try: | ||
| import yaml | ||
|
|
@@ -20,6 +23,8 @@ | |
| sys.exit("Python 3.10 or greater is required.") | ||
|
|
||
|
|
||
| MARKDOWN_FORMAT = "markdown-hard_line_breaks" | ||
|
|
||
| class _OneOrTwoArgsAction(argparse.Action): | ||
| def __call__(self, parser, namespace, values, option_string=None): | ||
| if values is not None and hasattr(values, "__len__") and len(values) > 2: | ||
|
|
@@ -89,8 +94,17 @@ def get_output(self, arguments, *args, **kwargs): | |
|
|
||
|
|
||
| class DocBuilder: | ||
| def __init__(self): | ||
| def __init__(self, *, repo_root: Optional[Union[Path, str]] = None): | ||
| super().__init__() | ||
| if repo_root is not None: | ||
| self._repo_root = Path(repo_root) | ||
| else: | ||
| self._repo_root = Path( | ||
| git.get_output( | ||
| ["rev-parse", "--show-toplevel"], | ||
| cwd=self._get_class_file().parent, | ||
| ).strip() | ||
| ) | ||
|
|
||
| # MARK: Target Functions | ||
| def build_docs(self, args): | ||
|
|
@@ -104,13 +118,16 @@ def build_docs(self, args): | |
| elif len(args.diff) > 2: | ||
| raise ValueError(f"At most 2 arguments for --diff - got {len(args.diff)}") | ||
| args.output.mkdir(parents=True, exist_ok=True) | ||
| self.get_artifacts_dir(args.output).mkdir(parents=True, exist_ok=True) | ||
|
|
||
| shutil.copytree( | ||
| self.get_specification_root(), | ||
| self.get_artifacts_dir(args.output), | ||
| dirs_exist_ok=True, | ||
| ) | ||
| combined = self.preprocess_build(args) | ||
| if args.diff: | ||
| combined = self.generate_combined_diff( | ||
| args, args.diff[0], args.diff[1] | ||
| ) | ||
| filename = combined.stem | ||
| else: | ||
| combined = self._setup_and_preprocess(args) | ||
| filename = self.get_file_base_name() | ||
|
|
||
| spec = self.get_metadata_defaults_file() | ||
| subtitle = self.get_subtitle(spec) | ||
|
|
@@ -164,7 +181,7 @@ def build_docs(self, args): | |
| "2", | ||
| "--standalone", | ||
| "--number-sections=true", | ||
| "--from=markdown-hard_line_breaks", | ||
| "--from", MARKDOWN_FORMAT, | ||
| "--pdf-engine=tectonic", | ||
| ] | ||
|
|
||
|
|
@@ -176,8 +193,6 @@ def build_docs(self, args): | |
| docx = None | ||
| html = None | ||
|
|
||
| filename = self.get_file_base_name() | ||
|
|
||
| if not args.no_html: | ||
| html = args.output / f"{filename}.html" | ||
| html_template = self.get_scripts_root() / "template" / "default.html5" | ||
|
|
@@ -313,6 +328,73 @@ def flatten(self, args, source, output, substitutions: Dict[str, str] = None): | |
| else: | ||
| out.write(line) | ||
|
|
||
| def _setup_and_preprocess(self, args): | ||
| """Copy specification into artifacts dir and run preprocess_build. Caller must ensure args.output exists.""" | ||
| shutil.copytree( | ||
| self.get_specification_root(), | ||
| self.get_artifacts_dir(args.output), | ||
| dirs_exist_ok=True, | ||
| ) | ||
| return self.preprocess_build(args) | ||
|
|
||
| def _build_combined_for_ref(self, args, ref, worktree_path, output_subdir): | ||
| """Build combined.md for a given ref using a temporary worktree. Removes worktree in finally.""" | ||
| worktree_path = Path(worktree_path) | ||
| output_dir = Path(args.output) / output_subdir | ||
| try: | ||
| git( | ||
| ["worktree", "add", str(worktree_path), ref], | ||
| cwd=self.get_repo_root(), | ||
| ) | ||
| builder = self.__class__(repo_root=worktree_path) | ||
| ref_args = types.SimpleNamespace( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is cool :) |
||
| output=output_dir, | ||
| no_draft=getattr(args, "no_draft", False), | ||
| only=getattr(args, "only", []), | ||
| exclude=getattr(args, "exclude", []), | ||
| ) | ||
| output_dir.mkdir(parents=True, exist_ok=True) | ||
| return builder._setup_and_preprocess(ref_args) | ||
| finally: | ||
| try: | ||
| git( | ||
| ["worktree", "remove", str(worktree_path)], | ||
| cwd=self.get_repo_root(), | ||
| ) | ||
| except subprocess.CalledProcessError: | ||
| pass | ||
|
|
||
| def generate_combined_diff(self, args, from_ref, to_ref): | ||
| """Build combined diff markdown from two refs; returns path to diff_X_Y.md.""" | ||
| from_short = self.resolve_ref(from_ref, short=True) | ||
| to_short = self.resolve_ref(to_ref, short=True) | ||
| diff_basename = f"diff_{from_short}_{to_short}" | ||
| diff_dir = args.output / "diff" | ||
| diff_dir.mkdir(parents=True, exist_ok=True) | ||
| worktree_from = diff_dir / "wt_from" | ||
| worktree_to = diff_dir / "wt_to" | ||
| combined_from = self._build_combined_for_ref( | ||
| args, from_ref, worktree_from, "diff_from" | ||
| ) | ||
| combined_to = self._build_combined_for_ref( | ||
| args, to_ref, worktree_to, "diff_to" | ||
| ) | ||
| ast_from = diff_dir / "ast_from.json" | ||
| ast_to = diff_dir / "ast_to.json" | ||
| pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_from, combined_from]) | ||
| pandoc(["-f", MARKDOWN_FORMAT, "-t", "json", "-o", ast_to, combined_to]) | ||
| diff_ast_path = diff_dir / f"{diff_basename}.json" | ||
| diff_ast_files( | ||
| str(ast_from), str(ast_to), str(diff_ast_path) | ||
| ) | ||
| # Not strictly necessary (Pandoc can take JSON as input), but converting | ||
| # to markdown unifies the pipeline with the non-diff path and eases debugging. | ||
| combined_diff_md = diff_dir / f"{diff_basename}.md" | ||
| pandoc( | ||
| ["-f", "json", "-t", MARKDOWN_FORMAT, "-o", combined_diff_md, diff_ast_path] | ||
| ) | ||
| return combined_diff_md | ||
|
|
||
| def clean_docs(self, args): | ||
| if args.output.exists(): | ||
| shutil.rmtree(args.output) | ||
|
|
@@ -490,8 +572,7 @@ def get_scripts_root(self) -> Path: | |
| return Path(__file__).resolve().parent | ||
|
|
||
| def get_repo_root(self) -> Path: | ||
| """Assumes that the repo root is two up from this root""" | ||
| return self._get_class_file().parent.parent | ||
| return self._repo_root | ||
|
|
||
| def get_specification_root(self) -> Path: | ||
| return self.get_repo_root() / "specification" | ||
|
|
@@ -512,10 +593,15 @@ def get_entry_point(self, args) -> Path: | |
|
|
||
| # MARK: Utility Functions | ||
|
|
||
| def resolve_ref(self, ref: str, short: bool = False) -> str: | ||
| """Resolve a git ref (branch, tag, hash) to a commit hash in the repo.""" | ||
| args = ["rev-parse", "--short", ref] if short else ["rev-parse", ref] | ||
| return git.get_output(args, cwd=self.get_repo_root()).strip() | ||
|
|
||
| def get_subtitle(self, defaults_file_path: Path): | ||
| with open(defaults_file_path, "r") as f: | ||
| spec_data = yaml.load(f, Loader=yaml.SafeLoader) | ||
| commit = git.get_output(["rev-parse", "--short", "HEAD"], cwd=self.get_repo_root()).strip() | ||
| commit = self.resolve_ref("HEAD", short=True) | ||
| subtitle = f"v{spec_data['metadata']['version']} ({commit})" | ||
| return subtitle | ||
|
|
||
|
|
@@ -604,7 +690,6 @@ def make_build_parser(self, subparsers): | |
| build_parser.add_argument( | ||
| "--no-draft", help="Do not add draft watermark", action="store_true" | ||
| ) | ||
| # TODO: implement support (ie, use 'args.diff') | ||
| build_parser.add_argument( | ||
| "--diff", | ||
| nargs="+", | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,13 @@ | ||
| #! /usr/bin/env python3 | ||
|
|
||
| from doc_build.doc_builder import DocBuilder | ||
| from pathlib import Path | ||
|
|
||
| test_root = Path(__file__).parent.parent | ||
|
|
||
| class MyDocBuilder(DocBuilder): | ||
| pass | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| MyDocBuilder().process_argparser() | ||
| MyDocBuilder(repo_root=test_root).process_argparser() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good find, thanks! :)