Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ dependencies = [
"vl-convert-python",
"selenium",
"cairosvg",
"maturin"
]
classifiers = [
"Programming Language :: Rust",
Expand All @@ -37,5 +36,5 @@ package-data = {"website_diff" = ["*.css", "*.js"]}

[tool.maturin]
features = ["pyo3/extension-module"]
module-name = "website_diff.htmldiff"
module-name = "website_diff._htmldiff_rs"

2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn _htmldiff(a: &str, b: &str) -> String {
}

#[pymodule]
#[pyo3(name="htmldiff")]
#[pyo3(name="_htmldiff_rs")]
fn htmldiff(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(_htmldiff, m)?)?;
Ok(())
Expand Down
186 changes: 186 additions & 0 deletions website_diff/_htmldiff_py.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""Pure Python HTML diff — fallback for when the Rust extension is unavailable.

Implements the same HTML tokenizer and Wu-Manber-Myers O(NP) diff algorithm
that the Rust extension provides, producing byte-identical output.
"""

import enum

__all__ = ["_htmldiff"]


# ---------------------------------------------------------------------------
# HTML tokenizer (equivalent to src/html.rs)
# ---------------------------------------------------------------------------

class _Mode(enum.Enum):
CHAR = 0
TAG = 1
WHITESPACE = 2


def _split_html(s: str) -> list[str]:
words: list[str] = []
start = 0
mode = _Mode.CHAR

for i, c in enumerate(s):
if mode == _Mode.CHAR:
if c == "<":
if start != i:
words.append(s[start:i])
start = i
mode = _Mode.TAG
elif c.isspace():
if start != i:
words.append(s[start:i])
start = i
mode = _Mode.WHITESPACE
elif mode == _Mode.TAG:
if c == ">":
words.append(s[start : i + 1])
start = i + 1
mode = _Mode.CHAR
elif mode == _Mode.WHITESPACE:
if c == "<":
if start != i:
words.append(s[start:i])
start = i
mode = _Mode.TAG
elif not c.isspace():
if start != i:
words.append(s[start:i])
start = i
mode = _Mode.CHAR

if start < len(s):
words.append(s[start:])
return words


# ---------------------------------------------------------------------------
# Wu-Manber-Myers O(NP) diff (equivalent to src/wu.rs)
# ---------------------------------------------------------------------------

class _Edit:
__slots__ = ()


class _Common(_Edit):
__slots__ = ("old", "new")

def __init__(self, old: int, new: int):
self.old = old
self.new = new


class _Add(_Edit):
__slots__ = ("new",)

def __init__(self, new: int):
self.new = new


class _Delete(_Edit):
__slots__ = ("old",)

def __init__(self, old: int):
self.old = old


def _diff(a: list[str], b: list[str]) -> list[_Edit]:
reverse = len(a) > len(b)
if reverse:
a, b = b, a

m = len(a)
n = len(b)
delta = n - m
offset = m + 1
sz = m + n + 3

ids: list[int] = [-1] * sz
points: list[tuple[int, int, int]] = [] # (x, y, prev)

def snake(k: int, fp1: int, fp2: int) -> int:
fp = max(fp1, fp2)
y = fp
x = fp - k
while x < m and y < n and a[x] == b[y]:
x += 1
y += 1
ko = k + offset
prev = ids[ko - 1] if fp1 >= fp2 else ids[ko + 1]
ids[ko] = len(points)
points.append((x, y, prev))
return y

fp = [-1] * sz
p = -1
delta_offset = delta + offset
while True:
p += 1
for k in range(-p, delta):
ko = k + offset
fp[ko] = snake(k, fp[ko - 1] + 1, fp[ko + 1])
for k in range(delta + p, delta, -1):
ko = k + offset
fp[ko] = snake(k, fp[ko - 1] + 1, fp[ko + 1])
fp[delta_offset] = snake(
delta, fp[delta_offset - 1] + 1, fp[delta_offset + 1]
)
if fp[delta_offset] >= n:
break

route: list[tuple[int, int]] = []
prev = ids[delta_offset]
while prev != -1:
px, py, prev = points[prev]
route.append((px, py))

ses: list[_Edit] = []
cx, cy = 0, 0
for px, py in reversed(route):
while cx < px or cy < py:
if py + cx > px + cy:
ses.append(_Delete(cy) if reverse else _Add(cy))
cy += 1
elif py + cx < px + cy:
ses.append(_Add(cx) if reverse else _Delete(cx))
cx += 1
else:
ses.append(
_Common(cy, cx) if reverse else _Common(cx, cy)
)
cx += 1
cy += 1
return ses


# ---------------------------------------------------------------------------
# HTML diff builder (equivalent to src/builder.rs)
# ---------------------------------------------------------------------------

def _htmldiff(old_html: str, new_html: str) -> str:
"""Return *new_html* with ``<ins>`` / ``<del>`` tags marking changes."""
old_words = _split_html(old_html)
new_words = _split_html(new_html)
edits = _diff(old_words, new_words)

parts: list[str] = []
for edit in edits:
if isinstance(edit, _Common):
parts.append(old_words[edit.old])
elif isinstance(edit, _Add):
word = new_words[edit.new]
if word.startswith("<") and not word.startswith("<img"):
parts.append(word)
else:
parts.extend(["<ins>", word, "</ins>"])
elif isinstance(edit, _Delete):
word = old_words[edit.old]
if word.startswith("<") and not word.startswith("<img"):
parts.append(word)
else:
parts.extend(["<del>", word, "</del>"])
return "".join(parts)
8 changes: 8 additions & 0 deletions website_diff/htmldiff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""HTML diff — uses the Rust extension if available, otherwise pure Python."""

try:
from website_diff._htmldiff_rs import _htmldiff
except ImportError:
from website_diff._htmldiff_py import _htmldiff

__all__ = ["_htmldiff"]