diff --git a/seafile-download.sh b/seafile-download.sh new file mode 100755 index 0000000..4bd2bbc --- /dev/null +++ b/seafile-download.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Mirror Seafile client downloads from seafile.com/download. +# +# The official download page embeds direct OSS download links. +# This script fetches the page with wget, parses out OSS URLs with Python, +# downloads new/changed files atomically via wget, and removes stale local +# files (bounded by TUNASYNC_MAX_DELETE). +# +# wget is used (not curl or urllib) because the tunasync Docker bridge +# network can reach the Seafile AWS origin only via wget. +set -euo pipefail + +WORKDIR="${TUNASYNC_WORKING_DIR:-}" +UPSTREAM="${TUNASYNC_UPSTREAM_URL:-https://www.seafile.com/download/}" +MAX_DELETE="${TUNASYNC_MAX_DELETE:-50}" + +if [ -z "$WORKDIR" ]; then + echo "ERROR: TUNASYNC_WORKING_DIR not set" + exit 2 +fi + +mkdir -p "$WORKDIR" +cd "$WORKDIR" + +PAGE=$(mktemp -t seafile-page.XXXXXX.html) +trap 'rm -f "$PAGE"' EXIT + +echo "Fetching download page via wget..." +wget -qO "$PAGE" --timeout=30 --tries=3 "$UPSTREAM" || { + echo "ERROR: wget failed to fetch $UPSTREAM" + exit 1 +} + +python3 - "$WORKDIR" "$MAX_DELETE" "$UPSTREAM" "$PAGE" <<'PY' +import sys, os, urllib.parse, subprocess +from html.parser import HTMLParser + +WORKDIR = sys.argv[1] +MAX_DELETE = int(sys.argv[2]) +UPSTREAM = sys.argv[3] +PAGE = sys.argv[4] +OSS_PREFIX = "seafile-downloads.oss-cn-shanghai.aliyuncs.com" + +with open(PAGE, encoding="utf-8", errors="replace") as f: + html = f.read() + +class LinkExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.urls = [] + def handle_starttag(self, tag, attrs): + if tag == "a": + for k, v in attrs: + if k == "href" and v: + self.urls.append(v) + +parser = LinkExtractor() +parser.feed(html) + +oss_urls = [] +for u in parser.urls: + full = u if u.startswith("http") else "https://www.seafile.com" + (u if u.startswith("/") else "/" + u) + if OSS_PREFIX in full and "seafile-server" not in full: + oss_urls.append(full) + +if not oss_urls: + print("ERROR: no OSS download links found on page", file=sys.stderr) + sys.exit(1) + +print(f"Found {len(oss_urls)} client download(s)", file=sys.stderr) + + +def safe_basename(url): + """Derive a safe filename from the URL, rejecting traversal/separators.""" + raw = url.rstrip("/").rsplit("/", 1)[-1] + name = urllib.parse.unquote(raw) + if not name or name in (".", "..") or "/" in name or "\\" in name or "\x00" in name: + return None + return name + + +remote_names = set() +url_to_name = {} +for url in oss_urls: + name = safe_basename(url) + if name is None: + print(f"ERROR: refusing to use suspicious filename derived from {url!r}", file=sys.stderr) + sys.exit(1) + remote_names.add(name) + url_to_name[url] = name + + +def remote_size_via_spider(url): + """Get Content-Length using wget --spider. Returns int or None.""" + r = subprocess.run( + ["wget", "--spider", "--timeout=30", "--tries=1", "-S", url], + capture_output=True, text=True) + if r.returncode != 0: + return None, r.stderr + size = None + for line in r.stderr.split("\n"): + if "Content-Length:" in line: + try: + size = int(line.split(":", 1)[1].strip()) + except ValueError: + pass + return size, r.stderr + + +# Download each file atomically via wget (curl fails to Seafile AWS IPs +# from the Docker bridge network; wget works). +new_files = [] +for url in oss_urls: + name = url_to_name[url] + target = os.path.join(WORKDIR, name) + tmp = target + ".tmp" + + remote_size, stderr = remote_size_via_spider(url) + if remote_size is None: + # Some CDNs strip Content-Length on chunked or 302 responses; we still + # need to know whether the URL itself is reachable. + if stderr and "200 OK" not in stderr and "remote file exists" not in stderr.lower(): + print(f"ERROR: spider {url}: {stderr[-200:]}", file=sys.stderr) + sys.exit(1) + # Fall through; we'll download and trust wget to validate. + + if (remote_size is not None + and os.path.exists(target) + and os.path.getsize(target) == remote_size): + continue + + print(f"Downloading: {name} ({remote_size} bytes)" if remote_size is not None + else f"Downloading: {name}", file=sys.stderr) + r = subprocess.run( + ["wget", "-q", "--timeout=30", "--tries=3", "-O", tmp, url], + capture_output=True, text=True) + if r.returncode != 0: + print(f"ERROR: download {url}: {r.stderr[-200:]}", file=sys.stderr) + if os.path.exists(tmp): + os.remove(tmp) + sys.exit(1) + + if remote_size is not None: + downloaded = os.path.getsize(tmp) + if downloaded != remote_size: + print(f"ERROR: short read for {name}: got {downloaded}, " + f"expected {remote_size}", file=sys.stderr) + os.remove(tmp) + sys.exit(1) + + os.replace(tmp, target) + new_files.append(name) + +# Only delete stale files after all downloads succeeded so a transient +# upstream issue cannot wipe the mirror. +local_files = [f for f in os.listdir(WORKDIR) + if os.path.isfile(os.path.join(WORKDIR, f))] +stale = [f for f in local_files if f not in remote_names and not f.endswith(".tmp")] +if len(stale) > MAX_DELETE: + print(f"WARNING: {len(stale)} stale files exceeds MAX_DELETE ({MAX_DELETE})", + file=sys.stderr) + sys.exit(1) +for f in stale: + fp = os.path.join(WORKDIR, f) + print(f"Deleting stale: {f}", file=sys.stderr) + os.remove(fp) + +print("Done.", file=sys.stderr) +PY