Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/deliver-artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,4 @@ def main() -> int:


if __name__ == "__main__":
    # Run the script entry point and use its integer return value as the
    # process exit status. A single raise is sufficient: any statement after
    # it would be unreachable.
    raise SystemExit(main())
37 changes: 37 additions & 0 deletions scripts/i18n-detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""Detect language from text using character frequency analysis."""

from __future__ import annotations
import re

# Spanish hints: accented letters and inverted punctuation (single characters)
# plus a handful of very common function words.
SPANISH_MARKERS = {
    "á", "é", "í", "ó", "ú", "ñ", "ü", "¿", "¡",
    "los", "las", "del", "que", "es", "en", "con", "para", "por", "su", "una",
}
# English hints: very common function words.
ENGLISH_MARKERS = {
    "the", "and", "of", "to", "is", "in", "it", "you", "that", "was",
    "for", "are", "as", "be", "this", "have", "from",
}

# Single-character markers can only be found by scanning the text itself:
# they sit inside longer words ("mañana") or are punctuation ("¿") that the
# word regex never captures, so intersecting them with the word set misses them.
_SPANISH_CHAR_MARKERS = frozenset(m for m in SPANISH_MARKERS if len(m) == 1)
_SPANISH_WORD_MARKERS = frozenset(SPANISH_MARKERS) - _SPANISH_CHAR_MARKERS
_WORD_RE = re.compile(r"[a-záéíóúñü]+")


def detect_language(text: str) -> str:
    """Guess whether *text* is English or Spanish.

    The Spanish score is the number of distinct Spanish marker words found
    plus the number of distinct Spanish marker characters (accented letters,
    "ñ", "ü", "¿", "¡") that occur anywhere in the text. The English score is
    the number of distinct English marker words found. Matching is
    case-insensitive.

    Args:
        text: The text to classify.

    Returns:
        "es" when the Spanish score is strictly greater than the English
        score; otherwise "en". Empty text and ties therefore yield "en".
    """
    if not text:
        return "en"
    text_lower = text.lower()
    words = set(_WORD_RE.findall(text_lower))
    # frozenset.intersection accepts any iterable, so passing the string
    # checks each marker character against the characters of the text.
    es_score = len(words & _SPANISH_WORD_MARKERS) + len(
        _SPANISH_CHAR_MARKERS.intersection(text_lower)
    )
    en_score = len(words & ENGLISH_MARKERS)
    return "es" if es_score > en_score else "en"

def update_html_lang(html_path, lang: str) -> bool:
    """Set the ``lang`` attribute of the ``<html>`` tag in an HTML file.

    Only the first ``<html ...>`` opening tag is inspected and edited, so
    ``lang=``/``hreflang=`` attributes on other elements do not influence the
    result. The tag name and attribute name are matched case-insensitively.

    Args:
        html_path: Object exposing ``read_text(encoding=...)`` and
            ``write_text(text, encoding=...)`` (e.g. ``pathlib.Path``).
        lang: Language code to write, e.g. ``"en"`` or ``"es"``.

    Returns:
        True if the file was rewritten with a new or changed ``lang``
        attribute; False if the tag already carries *lang* (compared
        case-insensitively) or the document has no ``<html>`` tag. The file
        is left untouched in the False cases.
    """
    html = html_path.read_text(encoding="utf-8")

    # Locate the opening tag itself; the lookahead stops names that merely
    # start with "html" (e.g. "<html-widget>") from matching.
    tag_match = re.search(r"<html(?=[\s/>])[^>]*>", html, re.IGNORECASE)
    if tag_match is None:
        return False
    tag = tag_match.group(0)

    # The lookbehind rejects attributes that merely end in "lang", such as
    # xml:lang, data-lang or hreflang. Quoted and unquoted values are accepted.
    attr_match = re.search(
        r"""(?<![\w:-])lang\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
        tag,
        re.IGNORECASE,
    )
    if attr_match is not None:
        current = next(g for g in attr_match.groups() if g is not None)
        if current.lower() == lang.lower():
            return False  # already correct
        new_tag = tag[:attr_match.start()] + f'lang="{lang}"' + tag[attr_match.end():]
    else:
        # Insert right after the five characters of "<html", keeping the
        # tag's original letter case.
        new_tag = tag[:5] + f' lang="{lang}"' + tag[5:]

    updated = html[:tag_match.start()] + new_tag + html[tag_match.end():]
    html_path.write_text(updated, encoding="utf-8")
    return True

if __name__ == "__main__":
    import sys
    from pathlib import Path

    # Classify the file named by the first argument, or stdin when no
    # argument is given, and print the detected language code.
    if len(sys.argv) < 2:
        text = sys.stdin.read()
    else:
        # read_text opens and closes the file for us; UTF-8 is explicit so
        # accented Spanish characters decode the same under every locale.
        text = Path(sys.argv[1]).read_text(encoding="utf-8")
    print(detect_language(text))
27 changes: 27 additions & 0 deletions tests/test_i18n.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""Test i18n language detection and --lang flag."""
import subprocess, sys, json
from pathlib import Path

def test_detect_english():
    """Check that the detector script reports 'en' for an English sentence.

    The script lives in ``scripts/i18n-detect.py``; the hyphen makes it
    impossible to import as ``scripts.i18n_detect``, so it is executed as a
    script and fed the sentence on stdin (which it reads when given no
    arguments).
    """
    # tests/test_i18n.py -> repository root is one level above this file's directory.
    repo_root = Path(__file__).resolve().parents[1]
    r = subprocess.run(
        [sys.executable, str(repo_root / "scripts" / "i18n-detect.py")],
        input="The architecture consists of three main services",
        capture_output=True,
        text=True,
        cwd=repo_root,
    )
    # Separate asserts so a crash shows the child's stderr instead of a bare failure.
    assert r.returncode == 0, r.stderr
    assert r.stdout.strip() == "en"
    print("PASS: English text detected as 'en'")

def test_detect_spanish():
    """Check that the detector script reports 'es' for a Spanish sentence.

    The script lives in ``scripts/i18n-detect.py``; the hyphen makes it
    impossible to import as ``scripts.i18n_detect``, so it is executed as a
    script and fed the sentence on stdin (which it reads when given no
    arguments).
    """
    # tests/test_i18n.py -> repository root is one level above this file's directory.
    repo_root = Path(__file__).resolve().parents[1]
    r = subprocess.run(
        [sys.executable, str(repo_root / "scripts" / "i18n-detect.py")],
        input="La arquitectura consiste en tres servicios principales",
        capture_output=True,
        text=True,
        cwd=repo_root,
    )
    # Separate asserts so a crash shows the child's stderr instead of a bare failure.
    assert r.returncode == 0, r.stderr
    assert r.stdout.strip() == "es"
    print("PASS: Spanish text detected as 'es'")

def test_phrases_coverage():
    """Check that the phrase catalogue covers English and Spanish equally.

    Requires at least 20 entries under both "en" and "es" and identical key
    sets in the two languages.
    """
    # tests/test_i18n.py -> repository root is one level above this file's directory.
    phrases_path = Path(__file__).resolve().parents[1] / "scripts" / "i18n-phrases.json"
    with open(phrases_path, encoding="utf-8") as fh:
        phrases = json.load(fh)
    # Computed once so the f-strings below need no nested double quotes,
    # which are a SyntaxError on Python versions before 3.12.
    en_count = len(phrases["en"])
    assert en_count >= 20, f"Expected 20+ phrases, got {en_count}"
    assert len(phrases["es"]) >= 20
    assert set(phrases["en"].keys()) == set(phrases["es"].keys()), "ES and EN must have same keys"
    print(f"PASS: i18n-phrases.json has {en_count} phrases in both EN and ES")

if __name__ == "__main__":
    # Run each check in order; a failing assert in any of them aborts the
    # run before the summary line is printed.
    for check in (test_detect_english, test_detect_spanish, test_phrases_coverage):
        check()
    print("\nAll i18n tests passed.")
Loading