Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/scripts/add-home-banner.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,12 @@ def main():
else:
print("No changed chapters found")

# Add banner to ALL HTML files in the directory, not just index.html
html_files = list(html_dir.glob('*.html'))
print(f"Found {len(html_files)} HTML files to process")

for html_file in html_files:
add_home_page_banner(html_file, changed_chapters)
# Add banner only to index.html (the home page)
index_html = html_dir / 'index.html'
if index_html.exists():
add_home_page_banner(index_html, changed_chapters)
else:
print(f"Warning: index.html not found in {html_dir}", file=sys.stderr)

if __name__ == '__main__':
main()
36 changes: 21 additions & 15 deletions .github/scripts/create-docx-tracked-changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import sys
import subprocess
from datetime import datetime, timezone
from pathlib import Path

def checkout_base_docx(base_ref='origin/gh-pages', target_dir='/tmp/base-docx'):
Expand Down Expand Up @@ -61,35 +62,39 @@ def create_docx_with_tracked_changes(old_docx_path, new_docx_path, output_path):
from docx.opc.constants import RELATIONSHIP_TYPE as RT
import difflib
import shutil


# Use the PR author from GitHub Actions env if available, else a generic label
revision_author = os.getenv('GITHUB_ACTOR', 'PR Preview')
revision_date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

# First, copy the new document to the output path
shutil.copy2(new_docx_path, output_path)

# Load the output document
output_doc = Document(output_path)

# Enable track changes in the document settings
settings = output_doc.settings
settings_element = settings.element

# Add trackRevisions element if it doesn't exist
track_revisions = settings_element.find(qn('w:trackRevisions'))
if track_revisions is None:
track_revisions = OxmlElement('w:trackRevisions')
settings_element.append(track_revisions)

# Load old and new documents for comparison
old_doc = Document(old_docx_path)
new_doc = Document(new_docx_path)

# Get paragraphs from both documents
old_paragraphs = [p.text for p in old_doc.paragraphs]
new_paragraphs = [p.text for p in new_doc.paragraphs]

# Use difflib to find differences at paragraph level
matcher = difflib.SequenceMatcher(None, old_paragraphs, new_paragraphs)
has_changes = False

# Process each operation
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
if tag == 'replace':
Expand All @@ -104,16 +109,16 @@ def create_docx_with_tracked_changes(old_docx_path, new_docx_path, output_path):
# Create an insertion revision mark
ins = OxmlElement('w:ins')
ins.set(qn('w:id'), str(idx))
ins.set(qn('w:author'), 'PR Preview')
ins.set(qn('w:date'), '2024-01-01T00:00:00Z')
ins.set(qn('w:author'), revision_author)
ins.set(qn('w:date'), revision_date)

# Wrap the run's content in the insertion mark
run_element = run._element
parent = run_element.getparent()
parent.insert(parent.index(run_element), ins)
parent.remove(run_element)
ins.append(run_element)

elif tag == 'insert':
# New paragraphs were added
has_changes = True
Expand All @@ -124,8 +129,8 @@ def create_docx_with_tracked_changes(old_docx_path, new_docx_path, output_path):
for run in para.runs:
ins = OxmlElement('w:ins')
ins.set(qn('w:id'), str(1000 + idx))
ins.set(qn('w:author'), 'PR Preview')
ins.set(qn('w:date'), '2024-01-01T00:00:00Z')
ins.set(qn('w:author'), revision_author)
ins.set(qn('w:date'), revision_date)

run_element = run._element
parent = run_element.getparent()
Expand Down Expand Up @@ -166,7 +171,8 @@ def create_docx_with_tracked_changes(old_docx_path, new_docx_path, output_path):
shutil.copy2(new_docx_path, output_path)
print(f" ✓ Copied DOCX without tracked changes as fallback")
return True
except:
except Exception as copy_err:
print(f" ✗ Could not copy DOCX as fallback: {copy_err}", file=sys.stderr)
return False

def process_docx_file(new_docx_path, base_docx_dir):
Expand Down
28 changes: 16 additions & 12 deletions .github/scripts/detect-changed-chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,41 +27,44 @@ def checkout_base_files(base_ref='origin/gh-pages', target_dir='/tmp/base-files'
try:
# Fetch the gh-pages branch
result = subprocess.run(
['git', 'fetch', 'origin', 'gh-pages:gh-pages'],
check=False,
['git', 'fetch', 'origin', 'gh-pages:gh-pages'],
check=False,
capture_output=True,
text=True
text=True,
timeout=30
)

if result.returncode != 0:
print(f"Could not fetch gh-pages branch: {result.stderr}")
print("This is expected for:")
print(" - First PR to a new repository")
print(" - Repositories not using gh-pages for deployment")
return None

# List all HTML and DOCX files in gh-pages
result = subprocess.run(
['git', 'ls-tree', '-r', '--name-only', base_ref],
capture_output=True,
text=True,
check=False
check=False,
timeout=30
)

if result.returncode == 0:
files = [f for f in result.stdout.split('\n')
files = [f for f in result.stdout.split('\n')
if f.endswith('.html') or f.endswith('.docx')]

# Extract each file
for file in files:
output_path = target_path / file
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'wb') as f:
subprocess.run(
['git', 'show', f'{base_ref}:{file}'],
stdout=f,
check=False
check=False,
timeout=30
)

return target_path if files else None
Expand All @@ -81,7 +84,8 @@ def files_differ(file1, file2):
try:
with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
return f1.read() != f2.read()
except Exception:
except Exception as e:
print(f"Warning: could not compare {file1} and {file2}: {e}", file=sys.stderr)
return True

def main():
Expand Down
213 changes: 213 additions & 0 deletions IMPROVEMENT_AREAS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# Potential Areas for Improvement

This document identifies potential areas for improvement in the qbt (Quarto Book Template) repository based on a comprehensive code analysis.

## 1. Code Quality & Error Handling

### Python Scripts - Exception Handling

**File**: `.github/scripts/create-docx-tracked-changes.py` (line 169)
- **Issue**: Bare `except:` clause without exception type
- **Problem**: Catches all exceptions including `KeyboardInterrupt` and `SystemExit`, masking critical errors
- **Recommendation**: Change to `except Exception as e:` and add proper error logging

**File**: `.github/scripts/detect-changed-chapters.py` (line 84)
- **Issue**: Generic `except Exception:` without logging
- **Problem**: Silent failures when file operations fail
- **Recommendation**: Add logging to help troubleshoot failures

**File**: `.github/scripts/highlight-html-changes.py` (lines 462-508)
- **Issue**: Multiple subprocess calls with `check=False` without checking return codes
- **Problem**: Git command failures are silently ignored
- **Recommendation**: Check `result.returncode` or add proper error logging

## 2. Hardcoded Configuration Values

**File**: `.github/scripts/add-home-banner.py` (line 50)
- **Issue**: Hardcoded DOCX filename `"UCD-SeRG-Lab-Manual-tracked-changes.docx"`
- **Problem**: Not template-agnostic; requires modification for each book
- **Recommendation**: Extract to environment variable or configuration file

**File**: `.github/scripts/add-home-banner.py` (line 105)
- **Issue**: Banner injected into ALL HTML files
- **Problem**: Clutters every chapter with changes indicator
- **Recommendation**: Only inject into `index.html` or make conditional

**File**: `.github/scripts/create-docx-tracked-changes.py` (lines 108, 127, 191)
- **Issue**: Hardcoded author name `'PR Preview'` and dates `'2024-01-01'`
- **Problem**: Unrealistic metadata for tracking changes
- **Recommendation**: Use actual PR date and contributor info from GitHub Actions

## 3. Documentation Gaps

### Missing Python Documentation
- **Files**: All `.github/scripts/*.py` files
- **Issue**: No docstrings or type hints
- **Problem**: Difficult for contributors to understand function signatures
- **Recommendation**: Add comprehensive docstrings with parameter and return types

### Missing Scripts README
- **Location**: `.github/scripts/` directory
- **Issue**: No README explaining what each script does
- **Problem**: Contributors can't understand or run scripts independently
- **Recommendation**: Create `.github/scripts/README.md` with usage examples

### Missing Troubleshooting Guide
- **File**: `CONTRIBUTING.md`
- **Issue**: No error handling documentation
- **Problem**: Contributors don't know how to troubleshoot workflow failures
- **Recommendation**: Add troubleshooting section

## 4. Potential Bugs and Logic Issues

**File**: `.github/scripts/detect-changed-chapters.py` (lines 117-140)
- **Issue**: Nested path lookups could match wrong files
- **Problem**: Unreliable when directory structure varies
- **Recommendation**: Standardize path handling with explicit glob patterns

**File**: `.github/scripts/highlight-html-changes.py` (lines 273-277)
- **Issue**: Uses regex for HTML manipulation with `count=1`
- **Problem**: Wrong element could be replaced if HTML content repeats
- **Recommendation**: Use HTML parser (e.g., BeautifulSoup) instead of regex

**File**: `.github/scripts/create-docx-tracked-changes.py` (lines 86-87)
- **Issue**: Compares only paragraph text, not formatting or tables
- **Problem**: Many document changes won't show as tracked
- **Recommendation**: Compare XML representation or use proper DOCX diffing tool

**File**: `.github/scripts/check-bibliography-dois.R` (lines 119-124)
- **Issue**: Bare scoping assignment `<<-` in error handler
- **Problem**: Fragile; could cause issues if structure changes
- **Recommendation**: Use proper return mechanisms

## 5. Security & Robustness Issues

**File**: `.github/scripts/highlight-html-changes.py` (line 14)
- **Issue**: HTML unescape without sanitization
- **Problem**: Could be exploited with malicious content
- **Recommendation**: Add HTML sanitization layer

**File**: `.github/scripts/detect-changed-chapters.py` (lines 29-48)
- **Issue**: Subprocess calls without timeout
- **Problem**: Could hang indefinitely on network issues
- **Recommendation**: Add timeout parameter (e.g., `timeout=30`)

**File**: `.github/scripts/create-docx-tracked-changes.py` (lines 156-170)
- **Issue**: Fallback to copy file without validation
- **Problem**: Could silently deploy corrupted DOCX
- **Recommendation**: Add validation before returning success

## 6. Testing Gaps

### Missing Unit Tests
- **Issue**: No unit tests for any Python scripts
- **Problem**: Regressions undetected; hard to refactor safely
- **Recommendation**: Create `tests/` directory with pytest tests for:
- File change detection logic
- HTML highlighting functions
- DOCX comparison
- JSON output generation

### Missing Integration Tests
- **Issue**: No integration tests for workflow execution
- **Problem**: Can't verify workflows work end-to-end
- **Recommendation**: Add workflow test action or manual test documentation

### Missing Test Fixtures
- **Issue**: No test data/fixtures
- **Problem**: Can't test with realistic sample documents
- **Recommendation**: Create test fixtures with sample .qmd, HTML, DOCX files

## 7. Workflow & Process Issues

**File**: `.github/workflows/preview.yml` (lines 54-62)
- **Issue**: Prints raw environment variables in logs
- **Problem**: Could expose sensitive data in public logs
- **Recommendation**: Remove debug output or use `::notice` instead

**File**: `.github/workflows/check-bibliography-dois.yml` (line 45)
- **Issue**: No timeout for R script with API calls
- **Problem**: Could hang indefinitely waiting for CrossRef API
- **Recommendation**: Add timeout to Rscript invocation

**File**: `.github/workflows/lint-project.yaml`
- **Issue**: Workflow name doesn't match filename
- **Problem**: Confusing for navigation and CI/CD tracking
- **Recommendation**: Use consistent naming

## 8. Code Maintainability & Refactoring

### Repeated Git Operations
- **Files**: `detect-changed-chapters.py`, `create-docx-tracked-changes.py`, `highlight-html-changes.py`
- **Issue**: Duplicate code for `git fetch`, `git ls-tree`, `git show`
- **Recommendation**: Create shared utility module `.github/scripts/git_utils.py`

### Repeated Path Handling
- **Files**: Multiple scripts use `Path().glob()` and `os.getenv()`
- **Recommendation**: Create `.github/scripts/config.py` for centralized configuration

### HTML Parsing Complexity
- **File**: `highlight-html-changes.py` (200+ lines of regex-based parsing)
- **Issue**: Fragile and hard to maintain
- **Recommendation**: Replace with proper HTML parsing using BeautifulSoup (`from bs4 import BeautifulSoup`) or `lxml`

## 9. Configuration & Usability Issues

**File**: `_quarto.yml` (line 6)
- **Issue**: Placeholder values like `"Your Book Title"`, `"Your Name"`
- **Problem**: Users forget to customize
- **Recommendation**: Add validation script or warning when placeholders detected

**File**: `lychee.toml` (lines 20-28)
- **Issue**: URL patterns use exact placeholders; incomplete regex
- **Problem**: URL checker may incorrectly validate URLs
- **Recommendation**: Improve regex patterns

**File**: `DESCRIPTION` (lines 6-7)
- **Issue**: Placeholder values for author/email
- **Problem**: Not valid R package metadata
- **Recommendation**: Add initialization script that prompts for values

## 10. Accessibility & Documentation

**File**: `styles.css`
- **Issue**: No CSS contrast validation for dark mode
- **Problem**: May not be accessible to users with visual impairments
- **Recommendation**: Document minimum contrast requirements; test with accessibility tools

**File**: `.github/copilot-instructions.md`
- **Issue**: May contain outdated instructions
- **Recommendation**: Review and update for current best practices

## 11. Dependency & Version Management

### Missing Python Requirements File
- **Files**: `.github/scripts/*.py` use external libraries
- **Issue**: No `requirements.txt`
- **Problem**: Cannot reproduce environment
- **Recommendation**: Create `.github/scripts/requirements.txt` with version pins

### Missing R Version Specification
- **File**: `check-bibliography-dois.R`
- **Issue**: Uses modern R syntax without version check
- **Problem**: Could fail on older R versions
- **Recommendation**: Add R version check at script start

## 12. Other Notable Issues

- **File**: `.github/workflows/summary.yml` - Uses experimental `actions/ai-inference` which may not be stable
- **File**: `README.md` - Very long; should be split into separate documentation files
- **File**: `.lintr.R` - Complex regex patterns not well-documented

## Priority Summary

| Priority | Category | Impact |
|----------|----------|--------|
| **High** | Error Handling | Bare except clauses mask critical errors |
| **High** | Configuration | Hardcoded values prevent template reuse |
| **High** | Security | Missing timeouts could cause hangs |
| **Medium** | Testing | Zero test coverage prevents safe refactoring |
| **Medium** | Documentation | Missing docstrings/type hints reduce maintainability |
| **Medium** | Maintainability | Code duplication increases maintenance burden |
| **Low** | Usability | Placeholder values could confuse users |
| **Low** | Process | Inconsistent naming/organization |
Loading