-
Notifications
You must be signed in to change notification settings - Fork 180
Rewrite proxmox.sh: replace apt-sync+lftp with tsumugu + /iso parser #206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,42 +1,251 @@ | ||
| #!/bin/bash | ||
| # requires: wget, timeout | ||
| set -e | ||
| set -o pipefail | ||
| set -euo pipefail | ||
|
|
||
| _here=`dirname $(realpath $0)` | ||
| apt_sync="${_here}/apt-sync.py" | ||
| # Proxmox is intentionally not handled as a single root-level tsumugu sync. | ||
| # | ||
| # Why this custom wrapper exists: | ||
| # * download.proxmox.com has a normal nginx-style tree for /debian/ and /images/. | ||
| # * /iso/ is NOT a normal directory index. It is a custom HTML download page | ||
| # with product cards, buttons, and an external https://www.proxmox.com link. | ||
| # * tsumugu's nginx/apache/directory-lister parsers do not parse /iso/ safely. | ||
| # * tsumugu's fallback parser can discover /iso/ links, but in the current | ||
| # deployed version it reports ISO file sizes as 0 in list output and is much | ||
| # slower/less deterministic for production use. | ||
| # * Running tsumugu from the repository root with --exclude '^/iso/' is unsafe: | ||
| # excluded remote paths are still treated as stale local paths during cleanup, | ||
| # so an existing local /iso/ directory can be deleted. | ||
| # * Using --no-delete globally would protect /iso/, but would also stop cleanup | ||
| # for stale files under /debian/ and /images/, which is not acceptable. | ||
| # | ||
| # The safe design is therefore: | ||
| # 1. Let tsumugu fully own only the subtrees it can parse safely: /debian/ and | ||
| # /images/. Cleanup is still enabled inside those subtrees. | ||
| # 2. Handle /iso/ with a small purpose-built HTML link parser that performs | ||
| # HEAD checks, downloads to .tmp.* files, atomically replaces completed | ||
| # files, and deletes stale /iso/ files only within a bounded max-delete. | ||
| # | ||
| # If a future maintainer wants to remove this script, first prove with an | ||
| # isolated two-run test that tsumugu can parse /iso/, skip existing large ISO | ||
| # files correctly, and avoid deleting unrelated local data. | ||
|
|
||
| BASE_URL="${TUNASYNC_UPSTREAM_URL:-"http://download.proxmox.com"}" | ||
| BASE_PATH="${TUNASYNC_WORKING_DIR}" | ||
| UPSTREAM=${TUNASYNC_UPSTREAM_URL:-http://download.proxmox.com/} | ||
| WORKDIR=${TUNASYNC_WORKING_DIR:?TUNASYNC_WORKING_DIR is required} | ||
| THREADS=${TUNASYNC_TSUMUGU_THREADS:-1} | ||
| MAXDELETE=${TUNASYNC_TSUMUGU_MAXDELETE:-10000} | ||
| USERAGENT=${TUNASYNC_TSUMUGU_USERAGENT:-"tsumugu/$(tsumugu --version | tail -n1 | cut -d' ' -f2)"} | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. After computing USERAGENT we 'export TUNASYNC_TSUMUGU_USERAGENT="$USERAGENT"' so the Python /iso stage inherits the same value. The tsumugu invocation continues to pass --user-agent "$USERAGENT". Both stages now agree on UA. |
||
| export TUNASYNC_TSUMUGU_USERAGENT="$USERAGENT" | ||
| export NO_COLOR=1 | ||
|
|
||
| APT_PATH="${BASE_PATH}/debian" | ||
| PVE_PATH="${APT_PATH}/pve" | ||
| PBS_PATH="${APT_PATH}/pbs" | ||
| PBS_CLIENT_PATH="${APT_PATH}/pbs-client" | ||
| PMG_PATH="${APT_PATH}/pmg" | ||
| mkdir -p "$WORKDIR" | ||
| cd "$WORKDIR" | ||
|
|
||
| # === download deb packages ==== | ||
| # Sync one nginx-indexed subtree into its matching local subdirectory. | ||
| # | ||
| # IMPORTANT: The local workdir passed to tsumugu is "$WORKDIR/$name", not the | ||
| # repository root. This confines tsumugu's cleanup/deletion logic to that | ||
| # subtree. It must not be changed back to "$WORKDIR" unless /iso/ deletion has | ||
| # been proven safe again. | ||
| # Mirror one upstream subtree into "$WORKDIR/<subtree>" with tsumugu. | ||
| # Globals: WORKDIR, UPSTREAM, USERAGENT, MAXDELETE, THREADS (all read). | ||
| # Arguments: $1 - subtree name; any further arguments are forwarded to | ||
| # `tsumugu sync` ahead of the upstream URL and the destination. | ||
| sync_subtree() { | ||
| local subtree="$1" | ||
| shift | ||
| local dest="$WORKDIR/$subtree" | ||
| # Fixed options shared by every subtree; caller extras come after them. | ||
| local -a sync_opts=( | ||
| --timezone 0 | ||
| --user-agent "$USERAGENT" | ||
| --max-delete "$MAXDELETE" | ||
| --parser nginx | ||
| --threads "$THREADS" | ||
| ) | ||
| mkdir -p "$dest" | ||
| # ${UPSTREAM%/} strips one trailing slash so the URL has no double slash. | ||
| tsumugu sync "${sync_opts[@]}" "$@" "${UPSTREAM%/}/$subtree/" "$dest" | ||
| } | ||
|
|
||
| "$apt_sync" --delete "${BASE_URL}/debian/pve" @debian-current pve-no-subscription,pvetest amd64 "$PVE_PATH" | ||
| "$apt_sync" --delete "${BASE_URL}/debian/pbs" @debian-current pbs-no-subscription amd64 "$PBS_PATH" | ||
| "$apt_sync" --delete "${BASE_URL}/debian/pbs-client" @debian-current main amd64 "$PBS_CLIENT_PATH" | ||
| "$apt_sync" --delete "${BASE_URL}/debian/pmg" @debian-current pmg-no-subscription amd64 "$PMG_PATH" | ||
| # upstream directory structure | ||
| ln -sf pve/dists $APT_PATH/dists | ||
| echo "Debian finished" | ||
| # The Debian repository has a few upstream directories that are present in the | ||
| # HTML index but return 401 when listed. Exclude only those known-bad leaves, | ||
| # plus changelog files already excluded by the historical Proxmox config. | ||
| sync_subtree debian \ | ||
| --exclude '/devel/dists/.+changelog$' \ | ||
| --exclude '/pmg/dists/.+changelog$' \ | ||
| --exclude '^/dists/trixie/pve-test/binary-arm64/' \ | ||
| --exclude '^/pve/dists/trixie/pve-test/binary-arm64/' | ||
|
|
||
| # === download standalone files ==== | ||
| # /images/ is a normal nginx-indexed subtree and can be fully owned by tsumugu. | ||
| sync_subtree images | ||
|
|
||
| function sync_files() { | ||
| repo_url="$1" | ||
| repo_dir="$2" | ||
| python3 - <<'PY' | ||
| import email.utils | ||
| import html.parser | ||
| import os | ||
| import re | ||
| import shutil | ||
| import socket | ||
| import sys | ||
| import urllib.parse | ||
| import urllib.request | ||
| from pathlib import Path | ||
|
|
||
| [ ! -d "$repo_dir" ] && mkdir -p "$repo_dir" | ||
| cd $repo_dir | ||
| lftp "${repo_url}/" -e "mirror --verbose -P 5 --delete --only-newer; bye" | ||
| } | ||
| # /iso/ custom sync stage | ||
| # ------------------------ | ||
| # The official page http://download.proxmox.com/iso/ is a custom HTML page. | ||
| # It currently contains one external link to https://www.proxmox.com plus one | ||
| # same-host link for every published ISO-side artifact (*.iso, *.torrent, | ||
| # *.sha256, *.asc). We intentionally accept only same-host /iso/ links and | ||
| # conservative filenames, so a layout change or unexpected external link cannot | ||
| # cause arbitrary downloads. | ||
| # | ||
| # Completeness model: | ||
| # * remote_names is derived from the official /iso/ page's same-host links. | ||
| # * each remote file is HEADed to obtain Content-Length and Last-Modified. | ||
| # * existing local files with matching size are kept and their mtime updated. | ||
| # * changed/missing files are downloaded to .tmp.<name> first, then atomically | ||
| # renamed into place. | ||
| # * stale local files under /iso/ are removed only after all remote links have | ||
| # been processed, and only if the count is <= TUNASYNC_PROXMOX_ISO_MAXDELETE. | ||
| # | ||
| # This stage deliberately does not verify SHA256 itself; however the mirror was | ||
| # separately reviewed after implementation and all official *.sha256 files | ||
| # matched their local *.iso payloads. Future reviews can repeat that check. | ||
| socket.setdefaulttimeout(60) | ||
| base = os.environ.get('TUNASYNC_UPSTREAM_URL', 'http://download.proxmox.com/').rstrip('/') + '/iso/' | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. The Python stage now derives 'allowed_netloc = urllib.parse.urlparse(base).netloc' from TUNASYNC_UPSTREAM_URL and validates each candidate URL against that. The hardcoded 'download.proxmox.com' string is gone. |
||
| allowed_netloc = urllib.parse.urlparse(base).netloc | ||
| work = Path(os.environ['TUNASYNC_WORKING_DIR']) / 'iso' | ||
| work.mkdir(parents=True, exist_ok=True) | ||
| user_agent = os.environ.get('TUNASYNC_TSUMUGU_USERAGENT', 'tsumugu') | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as the line-35 reply: USERAGENT is exported into the Python stage's environment. |
||
| max_delete = int(os.environ.get('TUNASYNC_PROXMOX_ISO_MAXDELETE', '100')) | ||
|
|
||
| # HTML parser that records the href value of every <a> start tag it is fed. | ||
| # Matches accumulate in self.hrefs in the order the tags are encountered. | ||
| class LinkParser(html.parser.HTMLParser): | ||
| def __init__(self): | ||
| super().__init__() | ||
| # Raw href strings as written in the page; callers resolve them later. | ||
| self.hrefs = [] | ||
| def handle_starttag(self, tag, attrs): | ||
| # Ignore every tag except anchors. | ||
| if tag.lower() != 'a': | ||
| return | ||
| for k, v in attrs: | ||
| # `and v` skips href attributes that have no value or an empty value. | ||
| if k.lower() == 'href' and v: | ||
| self.hrefs.append(v) | ||
|
|
||
| req = urllib.request.Request(base, headers={'User-Agent': user_agent}) | ||
| with urllib.request.urlopen(req, timeout=60) as resp: | ||
| html = resp.read().decode('utf-8', 'replace') | ||
|
|
||
| parser = LinkParser() | ||
| parser.feed(html) | ||
| files = [] | ||
| seen = set() | ||
| for href in parser.hrefs: | ||
| url = urllib.parse.urljoin(base, href) | ||
| parsed = urllib.parse.urlparse(url) | ||
| if parsed.scheme not in ('http', 'https') or parsed.netloc != allowed_netloc: | ||
| continue | ||
|
Comment on lines +135 to +138
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as the line-107 reply. |
||
| if not parsed.path.startswith('/iso/'): | ||
| continue | ||
| name = urllib.parse.unquote(Path(parsed.path).name) | ||
| if not name or name in ('.', '..') or name.endswith('/'): | ||
| continue | ||
| # The page itself is not part of the downloadable ISO artifact set. Keeping | ||
| # it would make us mirror presentation HTML rather than repository content. | ||
| if name == 'index.html': | ||
| continue | ||
| # Keep filenames intentionally conservative. If Proxmox ever introduces | ||
| # names outside this set, review the page before broadening the regex. | ||
| if not re.match(r'^[A-Za-z0-9._+~:-]+$', name): | ||
| print(f'skip suspicious iso link: {name}', file=sys.stderr) | ||
| continue | ||
| if name not in seen: | ||
| seen.add(name) | ||
| files.append((name, urllib.parse.urljoin(base, urllib.parse.quote(name)))) | ||
|
|
||
| if not files: | ||
| raise SystemExit('no ISO files found on Proxmox ISO page') | ||
|
|
||
| print(f'proxmox iso: discovered {len(files)} files', flush=True) | ||
| remote_names = {name for name, _ in files} | ||
| errors = [] | ||
| for name, url in files: | ||
| target = work / name | ||
| print(f'proxmox iso: checking {name}', flush=True) | ||
| size = -1 | ||
| mtime = None | ||
| try: | ||
| head_req = urllib.request.Request(url, method='HEAD', headers={'User-Agent': user_agent}) | ||
| with urllib.request.urlopen(head_req, timeout=30) as resp: | ||
| size = int(resp.headers.get('Content-Length', '-1')) | ||
| lm = resp.headers.get('Last-Modified') | ||
|
Comment on lines +169 to +172
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added a Range: bytes=0-0 GET fallback. When HEAD raises an exception we issue 'Range: bytes=0-0', extract Content-Range/Last-Modified, and proceed. Existing files are kept on transient failure; missing files still produce a non-zero exit at the end of the run. |
||
| if lm: | ||
| dt = email.utils.parsedate_to_datetime(lm) | ||
| if dt: | ||
| mtime = dt.timestamp() | ||
| except Exception as e: | ||
| # Some servers / CDNs reject HEAD with 4xx/405 even when GET works. | ||
| # Try a Range:bytes=0-0 GET so we still get Content-Range / Last-Modified | ||
| # and can decide whether the local file is up to date. | ||
| try: | ||
| range_req = urllib.request.Request( | ||
| url, | ||
| headers={'User-Agent': user_agent, 'Range': 'bytes=0-0'}, | ||
| ) | ||
| with urllib.request.urlopen(range_req, timeout=30) as resp: | ||
| cr = resp.headers.get('Content-Range', '') | ||
| if cr.startswith('bytes ') and '/' in cr: | ||
| try: | ||
| size = int(cr.rsplit('/', 1)[1]) | ||
| except ValueError: | ||
| pass | ||
| if size < 0: | ||
| cl = resp.headers.get('Content-Length') | ||
| if cl is not None: | ||
| try: | ||
| size = int(cl) | ||
| except ValueError: | ||
| pass | ||
| lm = resp.headers.get('Last-Modified') | ||
| if lm: | ||
| dt = email.utils.parsedate_to_datetime(lm) | ||
| if dt: | ||
| mtime = dt.timestamp() | ||
| except Exception as e2: | ||
| # A transient HEAD/GET failure for a file we already have should | ||
| # not delete or re-download the local file. A missing local file | ||
| # plus failure is recorded as an error so the job exits non-zero. | ||
| if target.exists(): | ||
| print(f'proxmox iso: HEAD/GET failed for existing {name}: {e}; ' | ||
| f'keeping local file', flush=True) | ||
| continue | ||
| print(f'proxmox iso: HEAD/GET failed for missing {name}: ' | ||
| f'{e!r} / {e2!r}', file=sys.stderr, flush=True) | ||
| errors.append(f'{name}: HEAD {e!r} / GET {e2!r}') | ||
| continue | ||
| if target.exists() and size >= 0 and target.stat().st_size == size: | ||
| print(f'proxmox iso: skipping {name}', flush=True) | ||
| if mtime: | ||
| os.utime(target, (mtime, mtime)) | ||
| continue | ||
| tmp = work / ('.tmp.' + name) | ||
| print(f'proxmox iso: downloading {name} ({size} bytes)', flush=True) | ||
| try: | ||
| get_req = urllib.request.Request(url, headers={'User-Agent': user_agent}) | ||
| with urllib.request.urlopen(get_req, timeout=60) as resp, tmp.open('wb') as out: | ||
| shutil.copyfileobj(resp, out, length=1024 * 1024) | ||
| if size >= 0 and tmp.stat().st_size != size: | ||
| raise RuntimeError(f'size mismatch: got {tmp.stat().st_size}, expected {size}') | ||
| tmp.replace(target) | ||
| if mtime: | ||
| os.utime(target, (mtime, mtime)) | ||
| except Exception as e: | ||
| tmp.unlink(missing_ok=True) | ||
| print(f'proxmox iso: download failed for {name}: {e}', file=sys.stderr, flush=True) | ||
| errors.append(f'{name}: GET {e}') | ||
| continue | ||
|
|
||
| sync_files "${BASE_URL}/images" "${BASE_PATH}/images" | ||
| sync_files "${BASE_URL}/iso" "${BASE_PATH}/iso" | ||
| if errors: | ||
| raise SystemExit('proxmox iso: completed with errors: ' + '; '.join(errors)) | ||
|
|
||
| echo "Proxmox finished" | ||
| # Delete only stale regular files directly under /iso/. This cannot affect | ||
| # /debian/ or /images/ because this stage never traverses outside work. | ||
| stale = [p for p in work.iterdir() if p.is_file() and not p.name.startswith('.tmp.') and p.name not in remote_names] | ||
| if len(stale) > max_delete: | ||
| raise SystemExit(f'proxmox iso: refusing to delete {len(stale)} stale files > max {max_delete}') | ||
| for p in stale: | ||
| print(f'proxmox iso: deleting stale {p.name}', flush=True) | ||
| p.unlink() | ||
| print('proxmox iso: finished', flush=True) | ||
| PY | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as the line-2 reply: pipefail is now on.