From 723ddd2946620a4fb57eed88bfc0139ed5c5c5c8 Mon Sep 17 00:00:00 2001
From: Shun Sakai
Date: Sat, 31 Jan 2026 14:11:23 +0900
Subject: [PATCH] ci: Add Ruff as code formatter and linter

---
 .github/workflows/ci.yml                     |  22 +-
 .python-version                              |   1 +
 pyproject.toml                               |  13 +
 user-config.py                               |   4 +-
 utils/cleanuptasks.py                        |  13 +-
 utils/extracti18n.py                         |  47 +-
 utils/stats.py                               |  11 +-
 uv.lock                                      |  44 ++
 video2commons/backend/__init__.py            |   4 +-
 video2commons/backend/categories/__init__.py |  58 +-
 video2commons/backend/download/__init__.py   | 138 ++---
 video2commons/backend/encode/__init__.py     |  62 ++-
 video2commons/backend/encode/globals.py      |  16 +-
 video2commons/backend/encode/transcode.py    | 226 ++++----
 video2commons/backend/encode/transcodejob.py | 315 ++++++-----
 video2commons/backend/subtitles/__init__.py  | 205 +++----
 video2commons/backend/upload/__init__.py     | 111 ++--
 video2commons/backend/user-config.py         |   4 +-
 video2commons/backend/worker.py              | 140 ++---
 video2commons/config.py                      |  26 +-
 video2commons/exceptions.py                  |   3 +-
 video2commons/frontend/__init__.py           |   4 +-
 video2commons/frontend/api.py                | 541 +++++++++----------
 video2commons/frontend/app.py                | 178 +++---
 video2commons/frontend/i18n.py               |  56 +-
 video2commons/frontend/redisession.py        |  31 +-
 video2commons/frontend/shared.py             |  13 +-
 video2commons/frontend/upload.py             |  40 +-
 video2commons/frontend/urlextract.py         | 394 +++++++------
 video2commons/frontend/wcqs.py               |  96 ++--
 video2commons/shared/stats.py                |  58 +-
 video2commons/user-config.py                 |   4 +-
 www/python/src/app.py                        |   5 +-
 www/python/src/user-config.py                |   4 +-
 34 files changed, 1516 insertions(+), 1371 deletions(-)
 create mode 100644 .python-version
 create mode 100644 pyproject.toml
 create mode 100644 uv.lock

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22409097..d2c953b6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,4 +1,4 @@
-name: Validate JavaScript Build
+name: CI
 
 on:
   pull_request:
@@ -6,6 +6,7 @@ on:
 
 jobs:
   check:
+    name: Validate JavaScript Build
     runs-on: ubuntu-latest
     container:
       image: debian:bookworm
@@ -61,3 +62,22 @@ jobs:
          git add video2commons/frontend/static/*.min.js video2commons/frontend/templates/*.min.html
          git commit -m "Update built files from CI"
          git push origin $GITHUB_HEAD_REF
+
+  ruff:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup uv
+        uses: astral-sh/setup-uv@v7.2.1
+        with:
+          python-version: "3.14"
+          enable-cache: true
+
+      - name: Run Ruff
+        run: uv run ruff check --output-format=github .
+
+      - name: Run the Ruff formatter
+        run: uv run ruff format --check .
diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..6324d401
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.14
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..0bb53817
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "video2commons"
+version = "0.1.0"
+requires-python = ">=3.14"
+
+[dependency-groups]
+dev = ["ruff>=0.14.14"]
+
+[tool.ruff.lint]
+ignore = ["E722"]
+
+[tool.ruff.format]
+docstring-code-format = true
diff --git a/user-config.py b/user-config.py
index 6e909215..884b1ac1 100644
--- a/user-config.py
+++ b/user-config.py
@@ -4,7 +4,7 @@
 """Pywikibot configs."""
 
-family = 'commons'
-mylang = 'commons'
+family = "commons"
+mylang = "commons"
 
 socket_timeout = 30, 300  # chunked uploading unreliable
diff --git a/utils/cleanuptasks.py b/utils/cleanuptasks.py
index 01a33628..1d2038fe 100644
--- a/utils/cleanuptasks.py
+++ b/utils/cleanuptasks.py
@@ -23,21 +23,20 @@
 import sys
 from redis import Redis
 
-sys.path.append(os.path.dirname(os.path.realpath(__file__)) +
-                "/../video2commons")
+sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../video2commons")
 from config import redis_pw, redis_host  # NOQA
 
 redisconnection = Redis(host=redis_host, db=3, password=redis_pw)
 
-for userkey in redisconnection.keys('tasks:*') + ['alltasks']:
+for userkey in redisconnection.keys("tasks:*") + ["alltasks"]:
     for taskid in redisconnection.lrange(userkey, 0, -1):
-        if not redisconnection.exists('titles:' + taskid):
+        if not redisconnection.exists("titles:" + taskid):
             redisconnection.lrem(userkey, 0, taskid)
             print("delete %s from %s" % (taskid, userkey))
 
-for pattern in ['params:*', 'restarted:*']:  # 'tasklock:*'
+for pattern in ["params:*", "restarted:*"]:  # 'tasklock:*'
     for key in redisconnection.keys(pattern):
-        taskid = key.split(':')[1]
-        if not redisconnection.exists('titles:' + taskid):
+        taskid = key.split(":")[1]
+        if not redisconnection.exists("titles:" + taskid):
             redisconnection.delete(key)
             print("delete %s" % (key))
diff --git a/utils/extracti18n.py b/utils/extracti18n.py
index b98af854..db9178f7 100644
--- a/utils/extracti18n.py
+++ b/utils/extracti18n.py
@@ -27,50 +27,55 @@
 import re
 import json
 
-if not len(sys.argv) > 1 or '/messages' not in sys.argv[1]:
-    print(("usage: python " + sys.argv[0] + " \n\n"
-           " The path to mediawiki/languages/messages\n"))
+if not len(sys.argv) > 1 or "/messages" not in sys.argv[1]:
+    print(
+        (
+            "usage: python " + sys.argv[0] + " \n\n"
+            " The path to mediawiki/languages/messages\n"
+        )
+    )
     sys.exit(1)
 
 msgDir = sys.argv[1]
-dest = os.path.dirname(os.path.realpath(__file__)) + \
-    '/../video2commons/frontend/i18n-metadata'
+dest = (
+    os.path.dirname(os.path.realpath(__file__))
+    + "/../video2commons/frontend/i18n-metadata"
+)
 
 data = {
-    'fallbacks': {},
-    'rtl': [],
-    'alllangs': [],
+    "fallbacks": {},
+    "rtl": [],
+    "alllangs": [],
 }
 
 rFallback = re.compile(r"fallback = '(.*?)'", re.I)
-rIsRtl = re.compile(r'rtl = true', re.I)
+rIsRtl = re.compile(r"rtl = true", re.I)
 
 for file in os.listdir(msgDir):
     filePath = msgDir + "/" + file
-    if file in ['.', '..'] or not os.path.isfile(filePath):
+    if file in [".", ".."] or not os.path.isfile(filePath):
         continue
 
-    with open(filePath, 'r') as openfile:
+    with open(filePath, "r") as openfile:
         content = openfile.read()
 
-    fileMatch = re.match(r'Messages(.*?)\.php', file)
-    source = fileMatch.group(1).lower().replace('_', '-')
+    fileMatch = re.match(r"Messages(.*?)\.php", file)
+    source =
fileMatch.group(1).lower().replace("_", "-") contentMatch = rFallback.search(content) if contentMatch: - fallbacks = [s.strip() for s in contentMatch.group(1).split(',')] - data['fallbacks'][source] = \ - fallbacks if len(fallbacks) > 1 else fallbacks[0] + fallbacks = [s.strip() for s in contentMatch.group(1).split(",")] + data["fallbacks"][source] = fallbacks if len(fallbacks) > 1 else fallbacks[0] if rIsRtl.search(content): - data['rtl'].append(source) + data["rtl"].append(source) - data['alllangs'].append(source) + data["alllangs"].append(source) def _write(key): dest_file = dest + "/" + key + ".json" - with open(dest_file, 'w') as openfile: - json.dump(data[key], openfile, sort_keys=True, - indent=4, separators=(',', ': ')) + with open(dest_file, "w") as openfile: + json.dump(data[key], openfile, sort_keys=True, indent=4, separators=(",", ": ")) + for key in data: _write(key) diff --git a/utils/stats.py b/utils/stats.py index d72b8373..ed69bc2a 100644 --- a/utils/stats.py +++ b/utils/stats.py @@ -6,7 +6,7 @@ import sys import time -sys.path.insert(0, '/srv/v2c') +sys.path.insert(0, "/srv/v2c") from redis import Redis @@ -17,7 +17,7 @@ collect_worker_stats, get_worker_stats, release_write_lock, - update_worker_stats + update_worker_stats, ) # Stats are considered stale if they haven't been updated in 30 minutes. @@ -41,8 +41,8 @@ def main(): # Don't update stats if they've been updated recently by another job. existing_stats = get_worker_stats(app_conn) - if existing_stats and 'last_updated_by_job' in existing_stats: - if int(time.time()) - existing_stats['last_updated_by_job'] < STALE_SECS: + if existing_stats and "last_updated_by_job" in existing_stats: + if int(time.time()) - existing_stats["last_updated_by_job"] < STALE_SECS: print("Stats have been updated recently, skipping update.") return @@ -58,5 +58,6 @@ def main(): finally: release_write_lock(app_conn) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..3443e5fa --- /dev/null +++ b/uv.lock @@ -0,0 +1,44 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" + +[[package]] +name = "ruff" +version = "0.14.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/06/f71e3a86b2df0dfa2d2f72195941cd09b44f87711cb7fa5193732cb9a5fc/ruff-0.14.14.tar.gz", hash = "sha256:2d0f819c9a90205f3a867dbbd0be083bee9912e170fd7d9704cc8ae45824896b", size = 4515732, upload-time = "2026-01-22T22:30:17.527Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/89/20a12e97bc6b9f9f68343952da08a8099c57237aef953a56b82711d55edd/ruff-0.14.14-py3-none-linux_armv6l.whl", hash = "sha256:7cfe36b56e8489dee8fbc777c61959f60ec0f1f11817e8f2415f429552846aed", size = 10467650, upload-time = "2026-01-22T22:30:08.578Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b1/c5de3fd2d5a831fcae21beda5e3589c0ba67eec8202e992388e4b17a6040/ruff-0.14.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6006a0082336e7920b9573ef8a7f52eec837add1265cc74e04ea8a4368cd704c", size = 10883245, upload-time = "2026-01-22T22:30:04.155Z" }, + { url = "https://files.pythonhosted.org/packages/b8/7c/3c1db59a10e7490f8f6f8559d1db8636cbb13dccebf18686f4e3c9d7c772/ruff-0.14.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:026c1d25996818f0bf498636686199d9bd0d9d6341c9c2c3b62e2a0198b758de", size = 10231273, upload-time = "2026-01-22T22:30:34.642Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/6e/5e0e0d9674be0f8581d1f5e0f0a04761203affce3232c1a1189d0e3b4dad/ruff-0.14.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f666445819d31210b71e0a6d1c01e24447a20b85458eea25a25fe8142210ae0e", size = 10585753, upload-time = "2026-01-22T22:30:31.781Z" }, + { url = "https://files.pythonhosted.org/packages/23/09/754ab09f46ff1884d422dc26d59ba18b4e5d355be147721bb2518aa2a014/ruff-0.14.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c0f18b922c6d2ff9a5e6c3ee16259adc513ca775bcf82c67ebab7cbd9da5bc8", size = 10286052, upload-time = "2026-01-22T22:30:24.827Z" }, + { url = "https://files.pythonhosted.org/packages/c8/cc/e71f88dd2a12afb5f50733851729d6b571a7c3a35bfdb16c3035132675a0/ruff-0.14.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1629e67489c2dea43e8658c3dba659edbfd87361624b4040d1df04c9740ae906", size = 11043637, upload-time = "2026-01-22T22:30:13.239Z" }, + { url = "https://files.pythonhosted.org/packages/67/b2/397245026352494497dac935d7f00f1468c03a23a0c5db6ad8fc49ca3fb2/ruff-0.14.14-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:27493a2131ea0f899057d49d303e4292b2cae2bb57253c1ed1f256fbcd1da480", size = 12194761, upload-time = "2026-01-22T22:30:22.542Z" }, + { url = "https://files.pythonhosted.org/packages/5b/06/06ef271459f778323112c51b7587ce85230785cd64e91772034ddb88f200/ruff-0.14.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ff589aab3f5b539e35db38425da31a57521efd1e4ad1ae08fc34dbe30bd7df", size = 12005701, upload-time = "2026-01-22T22:30:20.499Z" }, + { url = "https://files.pythonhosted.org/packages/41/d6/99364514541cf811ccc5ac44362f88df66373e9fec1b9d1c4cc830593fe7/ruff-0.14.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc12d74eef0f29f51775f5b755913eb523546b88e2d733e1d701fe65144e89b", size = 11282455, upload-time = "2026-01-22T22:29:59.679Z" }, + { url = "https://files.pythonhosted.org/packages/ca/71/37daa46f89475f8582b7762ecd2722492df26421714a33e72ccc9a84d7a5/ruff-0.14.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb8481604b7a9e75eff53772496201690ce2687067e038b3cc31aaf16aa0b974", size = 11215882, upload-time = "2026-01-22T22:29:57.032Z" }, + { url = "https://files.pythonhosted.org/packages/2c/10/a31f86169ec91c0705e618443ee74ede0bdd94da0a57b28e72db68b2dbac/ruff-0.14.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:14649acb1cf7b5d2d283ebd2f58d56b75836ed8c6f329664fa91cdea19e76e66", size = 11180549, upload-time = "2026-01-22T22:30:27.175Z" }, + { url = "https://files.pythonhosted.org/packages/fd/1e/c723f20536b5163adf79bdd10c5f093414293cdf567eed9bdb7b83940f3f/ruff-0.14.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8058d2145566510790eab4e2fad186002e288dec5e0d343a92fe7b0bc1b3e13", size = 10543416, upload-time = "2026-01-22T22:30:01.964Z" }, + { url = "https://files.pythonhosted.org/packages/3e/34/8a84cea7e42c2d94ba5bde1d7a4fae164d6318f13f933d92da6d7c2041ff/ruff-0.14.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e651e977a79e4c758eb807f0481d673a67ffe53cfa92209781dfa3a996cf8412", size = 10285491, upload-time = "2026-01-22T22:30:29.51Z" }, + { url = "https://files.pythonhosted.org/packages/55/ef/b7c5ea0be82518906c978e365e56a77f8de7678c8bb6651ccfbdc178c29f/ruff-0.14.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cc8b22da8d9d6fdd844a68ae937e2a0adf9b16514e9a97cc60355e2d4b219fc3", size = 10733525, upload-time = "2026-01-22T22:30:06.499Z" }, + { 
url = "https://files.pythonhosted.org/packages/6a/5b/aaf1dfbcc53a2811f6cc0a1759de24e4b03e02ba8762daabd9b6bd8c59e3/ruff-0.14.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:16bc890fb4cc9781bb05beb5ab4cd51be9e7cb376bf1dd3580512b24eb3fda2b", size = 11315626, upload-time = "2026-01-22T22:30:36.848Z" }, + { url = "https://files.pythonhosted.org/packages/2c/aa/9f89c719c467dfaf8ad799b9bae0df494513fb21d31a6059cb5870e57e74/ruff-0.14.14-py3-none-win32.whl", hash = "sha256:b530c191970b143375b6a68e6f743800b2b786bbcf03a7965b06c4bf04568167", size = 10502442, upload-time = "2026-01-22T22:30:38.93Z" }, + { url = "https://files.pythonhosted.org/packages/87/44/90fa543014c45560cae1fffc63ea059fb3575ee6e1cb654562197e5d16fb/ruff-0.14.14-py3-none-win_amd64.whl", hash = "sha256:3dde1435e6b6fe5b66506c1dff67a421d0b7f6488d466f651c07f4cab3bf20fd", size = 11630486, upload-time = "2026-01-22T22:30:10.852Z" }, + { url = "https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, +] + +[[package]] +name = "video2commons" +version = "0.1.0" +source = { virtual = "." } + +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + +[package.metadata] + +[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.14.14" }] diff --git a/video2commons/backend/__init__.py b/video2commons/backend/__init__.py index 5cf83f24..71a83ab2 100644 --- a/video2commons/backend/__init__.py +++ b/video2commons/backend/__init__.py @@ -19,8 +19,6 @@ """videocommons backend.""" - - from video2commons.backend import worker -__all__ = ['worker'] +__all__ = ["worker"] diff --git a/video2commons/backend/categories/__init__.py b/video2commons/backend/categories/__init__.py index b137b4bb..5037ca31 100644 --- a/video2commons/backend/categories/__init__.py +++ b/video2commons/backend/categories/__init__.py @@ -24,18 +24,26 @@ def has_video_track(source: str) -> bool: """Check if a video has an audio track.""" - result = subprocess.run([ - ffprobe_location, - '-loglevel', 'error', - '-select_streams', 'v', - '-show_entries', 'stream=index,codec_type', - '-of', 'json', - source - ], capture_output=True, text=True) + result = subprocess.run( + [ + ffprobe_location, + "-loglevel", + "error", + "-select_streams", + "v", + "-show_entries", + "stream=index,codec_type", + "-of", + "json", + source, + ], + capture_output=True, + text=True, + ) if result.returncode == 0: - for stream in json.loads(result.stdout).get('streams', []): - if stream.get('codec_type') == 'video': + for stream in json.loads(result.stdout).get("streams", []): + if stream.get("codec_type") == "video": return True return False @@ -44,18 +52,26 @@ def has_video_track(source: str) -> bool: def has_audio_track(source: str) -> bool: """Check if a video has an audio track.""" - result = subprocess.run([ - ffprobe_location, - '-loglevel', 'error', - '-select_streams', 'a', - '-show_entries', 'stream=index,codec_type', - '-of', 'json', - source - ], capture_output=True, text=True) + result = subprocess.run( + [ + ffprobe_location, + "-loglevel", + "error", + "-select_streams", + "a", + "-show_entries", + "stream=index,codec_type", + "-of", + "json", + source, + ], + capture_output=True, + text=True, + ) if result.returncode == 0: - for stream in json.loads(result.stdout).get('streams', []): - if stream.get('codec_type') == 'audio': + for stream in 
json.loads(result.stdout).get("streams", []): + if stream.get("codec_type") == "audio": return True return False @@ -82,7 +98,7 @@ def get_inferable_categories(source: str) -> Set[str]: categories = set() if not has_audio_track(source): - categories.add('[[Category:Videos without audio]]') + categories.add("[[Category:Videos without audio]]") return categories diff --git a/video2commons/backend/download/__init__.py b/video2commons/backend/download/__init__.py index d04f6a4c..fe9d2623 100644 --- a/video2commons/backend/download/__init__.py +++ b/video2commons/backend/download/__init__.py @@ -17,8 +17,6 @@ """Wrapper around youtube-dl.""" - - import os from urllib.parse import urlparse @@ -31,15 +29,15 @@ def download( - url, ie_key, formats, subtitles, outputdir, - statuscallback=None, errorcallback=None + url, ie_key, formats, subtitles, outputdir, statuscallback=None, errorcallback=None ): """Download a video from url to outputdir.""" - if url.startswith('uploads:'): + if url.startswith("uploads:"): # FIXME; this should be a configuration variable - url = url.replace('uploads:', 'https://video2commons.toolforge.org/' - 'static/uploads/', 1) + url = url.replace( + "uploads:", "https://video2commons.toolforge.org/static/uploads/", 1 + ) ie_key = None url_blacklisted(url) @@ -47,103 +45,107 @@ def download( outputdir = os.path.abspath(outputdir) statuscallback = statuscallback or (lambda text, percent: None) errorcallback = errorcallback or (lambda text: None) - outtmpl = outputdir + '/dl.%(ext)s' + outtmpl = outputdir + "/dl.%(ext)s" params = { - 'format': formats, - 'outtmpl': outtmpl, - 'writedescription': True, - 'writeinfojson': True, - 'writesubtitles': subtitles, - 'writeautomaticsub': False, - 'subtitleslangs': ['all', '-live_chat'], - 'subtitlesformat': 'srt/ass/vtt/best', - 'cachedir': '/tmp/', - 'noplaylist': True, # not implemented in video2commons - 'postprocessors': [{ - 'key': 'FFmpegSubtitlesConvertor', - 'format': 'srt', - }], - 'max_filesize': 5 * (1 << 30), - 'retries': 10, - 'fragment_retries': 10, - 'prefer_ffmpeg': True, # avconv do not have srt encoder - 'prefer_free_formats': True, - 'logger': get_logger('celery.task.v2c.main.yt_dlp') + "format": formats, + "outtmpl": outtmpl, + "writedescription": True, + "writeinfojson": True, + "writesubtitles": subtitles, + "writeautomaticsub": False, + "subtitleslangs": ["all", "-live_chat"], + "subtitlesformat": "srt/ass/vtt/best", + "cachedir": "/tmp/", + "noplaylist": True, # not implemented in video2commons + "postprocessors": [ + { + "key": "FFmpegSubtitlesConvertor", + "format": "srt", + } + ], + "max_filesize": 5 * (1 << 30), + "retries": 10, + "fragment_retries": 10, + "prefer_ffmpeg": True, # avconv do not have srt encoder + "prefer_free_formats": True, + "logger": get_logger("celery.task.v2c.main.yt_dlp"), } - old_ua = std_headers['User-Agent'] - if ie_key == 'Youtube': + old_ua = std_headers["User-Agent"] + if ie_key == "Youtube": # HACK: Get equirectangular for 360° videos (ytdl-org/youtube-dl#15267) - std_headers['User-Agent'] = '' + std_headers["User-Agent"] = "" # https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies # https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp - params.update({ - 'cookiefile': tooldir + '/../cookies.txt', - 'username': youtube_user, - 'password': youtube_pass - }) + params.update( + { + "cookiefile": tooldir + "/../cookies.txt", + "username": youtube_user, + "password": youtube_pass, + } + ) last_percentage = [Ellipsis] def progresshook(d): - if 
d['status'] == 'downloading': - total = d.get('total_bytes') or d.get('total_bytes_estimate') - percentage = int(100.0 * d['downloaded_bytes'] / total)\ - if total else None + if d["status"] == "downloading": + total = d.get("total_bytes") or d.get("total_bytes_estimate") + percentage = int(100.0 * d["downloaded_bytes"] / total) if total else None if percentage != last_percentage[0]: last_percentage[0] = percentage statuscallback( - 'Downloading to ' + (d['tmpfilename'] or d['filename']), - percentage + "Downloading to " + (d["tmpfilename"] or d["filename"]), percentage ) - elif d['status'] == 'finished': - statuscallback('Postprocessing...', -1) - elif d['status'] == 'error': - errorcallback('Error raised by YoutubeDL') + elif d["status"] == "finished": + statuscallback("Postprocessing...", -1) + elif d["status"] == "error": + errorcallback("Error raised by YoutubeDL") - statuscallback('Creating YoutubeDL instance', -1) + statuscallback("Creating YoutubeDL instance", -1) try: # Not using provided ie_key because of the existance of extractors that # targets another extractor, such as TwitterIE. with yt_dlp.YoutubeDL(params) as dl: dl.add_progress_hook(progresshook) - statuscallback('Preprocessing...', -1) + statuscallback("Preprocessing...", -1) info = dl.extract_info(url, download=True, ie_key=None) except DownloadError: - params['cachedir'] = False - statuscallback('Download failed.' - ' creating YoutubeDL instance without local cache', -1) + params["cachedir"] = False + statuscallback( + "Download failed. creating YoutubeDL instance without local cache", -1 + ) with yt_dlp.YoutubeDL(params) as dl: dl.add_progress_hook(progresshook) info = dl.extract_info(url, download=True, ie_key=None) finally: - std_headers['User-Agent'] = old_ua + std_headers["User-Agent"] = old_ua - if info.get('webpage_url'): - url_blacklisted(info['webpage_url']) + if info.get("webpage_url"): + url_blacklisted(info["webpage_url"]) - filename = outtmpl % {'ext': info['ext']} + filename = outtmpl % {"ext": info["ext"]} if not os.path.isfile(filename): # https://github.com/rg3/youtube-dl/issues/8349 - filename = outtmpl % {'ext': 'mkv'} - assert os.path.isfile(filename), \ - 'Failed to determine the path of the downloaded video. ' + \ - 'Is the video too large?' + filename = outtmpl % {"ext": "mkv"} + assert os.path.isfile(filename), ( + "Failed to determine the path of the downloaded video. " + + "Is the video too large?" 
+    )
 
     ret = {
-        'extractor': ie_key,
-        'subtitles': {},
-        'target': filename,
+        "extractor": ie_key,
+        "subtitles": {},
+        "target": filename,
     }
-    for key in info.get('subtitles', {}):
+    for key in info.get("subtitles", {}):
         # Postprocessed: converted to srt
-        filename = outtmpl % {'ext': key + '.srt'}
+        filename = outtmpl % {"ext": key + ".srt"}
         if os.path.isfile(filename):
-            ret['subtitles'][key] = filename
+            ret["subtitles"][key] = filename
 
     return ret
 
@@ -151,6 +153,6 @@ def progresshook(d):
 def url_blacklisted(url):
     """Define download url blacklist."""
     parseresult = urlparse(url)
-    if parseresult.scheme in ['http', 'https']:
-        if parseresult.netloc.endswith('.googlevideo.com'):
-            raise TaskError('Your downloading URL has been blacklisted.')
+    if parseresult.scheme in ["http", "https"]:
+        if parseresult.netloc.endswith(".googlevideo.com"):
+            raise TaskError("Your downloading URL has been blacklisted.")
diff --git a/video2commons/backend/encode/__init__.py b/video2commons/backend/encode/__init__.py
index 3a50099b..256565cd 100644
--- a/video2commons/backend/encode/__init__.py
+++ b/video2commons/backend/encode/__init__.py
@@ -21,6 +21,7 @@
 from .transcodejob import WebVideoTranscodeJob
 from .transcode import WebVideoTranscode
 from .globals import ffmpeg_location, ffprobe_location
+
 # https://github.com/senko/python-video-converter
 from converter import Converter
 
@@ -28,7 +29,7 @@ def encode(source, origkey, statuscallback=None, errorcallback=None, concurrency
     """Main encode function."""
     source = os.path.abspath(source)
-    preserve = {'video': False, 'audio': False}
+    preserve = {"video": False, "audio": False}
 
     c = Converter(ffmpeg_path=ffmpeg_location, ffprobe_path=ffprobe_location)
     info = c.probe(source)
@@ -38,14 +39,20 @@
     targettype = WebVideoTranscode.settings.get(key)
 
     if info and targettype:
-        if info.video and info.video.codec == targettype.get('videoCodec'):
-            preserve['video'] = True
-        if info.audio and info.audio.codec == targettype.get('audioCodec'):
-            preserve['audio'] = True
+        if info.video and info.video.codec == targettype.get("videoCodec"):
+            preserve["video"] = True
+        if info.audio and info.audio.codec == targettype.get("audioCodec"):
+            preserve["audio"] = True
 
-    target = source + '.' + key
+    target = source + "." + key
     job = WebVideoTranscodeJob(
-        source, target, key, preserve, statuscallback, errorcallback, info,
+        source,
+        target,
+        key,
+        preserve,
+        statuscallback,
+        errorcallback,
+        info,
         concurrency,
     )
 
@@ -55,33 +62,40 @@
 def getbestkey(info, targettype):
     """Find the best convert key to use."""
     # Asserts
-    assert info, 'The file format could not be recognized'
-    assert targettype, 'The target format is invalid.'
-    assert info.video or info.audio, 'The file has no video or audio tracks.'
-    assert info.video or not targettype.get('videoCodec'), \
-        'Video is asked to be kept but the file has no video tracks.'
-    assert info.audio or not targettype.get('audioCodec'), \
-        'Audio is asked to be kept but the file has no audio tracks.'
-
-    if targettype.get('videoCodec') and targettype.get('audioCodec'):
+    assert info, "The file format could not be recognized"
+    assert targettype, "The target format is invalid."
+    assert info.video or info.audio, "The file has no video or audio tracks."
+    assert info.video or not targettype.get("videoCodec"), (
+        "Video is asked to be kept but the file has no video tracks."
+ ) + assert info.audio or not targettype.get("audioCodec"), ( + "Audio is asked to be kept but the file has no audio tracks." + ) + + if targettype.get("videoCodec") and targettype.get("audioCodec"): # need both video & audio -- no codec change in video & audio for newkey, newtargettype in list(WebVideoTranscode.settings.items()): - if info.video.codec == newtargettype.get('videoCodec') and \ - info.audio.codec == newtargettype.get('audioCodec'): + if info.video.codec == newtargettype.get( + "videoCodec" + ) and info.audio.codec == newtargettype.get("audioCodec"): return newkey - elif targettype.get('videoCodec') and 'noaudio' in targettype: + elif targettype.get("videoCodec") and "noaudio" in targettype: # need video only -- no codec change in video & remove audio for newkey, newtargettype in list(WebVideoTranscode.settings.items()): - if info.video.codec == newtargettype.get('videoCodec') and \ - 'noaudio' in newtargettype: + if ( + info.video.codec == newtargettype.get("videoCodec") + and "noaudio" in newtargettype + ): return newkey - elif 'novideo' in targettype and targettype.get('audioCodec'): + elif "novideo" in targettype and targettype.get("audioCodec"): # need video only -- no codec change in audio & remove video for newkey, newtargettype in list(WebVideoTranscode.settings.items()): - if info.audio.codec == newtargettype.get('audioCodec') and \ - 'novideo' in newtargettype: + if ( + info.audio.codec == newtargettype.get("audioCodec") + and "novideo" in newtargettype + ): return newkey return None diff --git a/video2commons/backend/encode/globals.py b/video2commons/backend/encode/globals.py index 7235cc15..b502b361 100644 --- a/video2commons/backend/encode/globals.py +++ b/video2commons/backend/encode/globals.py @@ -39,26 +39,27 @@ # Maximum file size transcoding processes can create, in KB background_size_limit = 10 * 1024 * 1024 # 10GB # Number of threads to use in avconv for transcoding -ffmpeg_threads = __import__('multiprocessing').cpu_count() +ffmpeg_threads = __import__("multiprocessing").cpu_count() # Location of the avconv/ffmpeg binary (used to encode WebM and for thumbnails) -ffmpeg_location = '/mnt/nfs/labstore-secondary-project/gentoo-prefix/usr/bin/ffmpeg' -ffprobe_location = '/usr/bin/ffprobe' +ffmpeg_location = "/mnt/nfs/labstore-secondary-project/gentoo-prefix/usr/bin/ffmpeg" +ffprobe_location = "/usr/bin/ffprobe" def escape_shellarg(*args): """Escape shell arguments.""" import pipes + return " ".join([pipes.quote(str(arg)) for arg in args]) -def format_size(num, suffix='B'): +def format_size(num, suffix="B"): """Format the size with prefixes.""" # Source: StackOverflow - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: if abs(num) < 1024.0: return "%3.1f%s%s" % (num, unit, suffix) num /= 1024.0 - return "%.1f%s%s" % (num, 'Y', suffix) + return "%.1f%s%s" % (num, "Y", suffix) def format_time(s): @@ -70,5 +71,4 @@ def format_time(s): def time_to_seconds(time): """Get the number of seconds from time expression.""" - return \ - sum([a * b for a, b in zip([3600, 60, 1], list(map(int, time.split(':'))))]) + return sum([a * b for a, b in zip([3600, 60, 1], list(map(int, time.split(":"))))]) diff --git a/video2commons/backend/encode/transcode.py b/video2commons/backend/encode/transcode.py index dbaded02..79af2ac9 100644 --- a/video2commons/backend/encode/transcode.py +++ b/video2commons/backend/encode/transcode.py @@ -36,131 +36,117 @@ class WebVideoTranscode: """ settings = { - 'ogv': - { - 'videoQuality': 7, 
- 'audioQuality': 6, - 'noUpscaling': 'True', - 'twopass': 'False', - 'optimize': 'True', - 'keyframeInterval': '128', - 'videoCodec': 'theora', - 'audioCodec': 'vorbis', - 'type': 'video/ogg codecs="theora, vorbis"', - }, - 'an.ogv': - { - 'videoQuality': 7, - 'noUpscaling': 'True', - 'twopass': 'False', - 'optimize': 'True', - 'keyframeInterval': '128', - 'videoCodec': 'theora', - 'noaudio': 'True', - 'type': 'video/ogg codecs="theora, vorbis"', - }, - + "ogv": { + "videoQuality": 7, + "audioQuality": 6, + "noUpscaling": "True", + "twopass": "False", + "optimize": "True", + "keyframeInterval": "128", + "videoCodec": "theora", + "audioCodec": "vorbis", + "type": 'video/ogg codecs="theora, vorbis"', + }, + "an.ogv": { + "videoQuality": 7, + "noUpscaling": "True", + "twopass": "False", + "optimize": "True", + "keyframeInterval": "128", + "videoCodec": "theora", + "noaudio": "True", + "type": 'video/ogg codecs="theora, vorbis"', + }, # WebM transcode: - 'webm': - { - 'crf': 10, - 'videoBitrate': '0', - 'audioQuality': 6, - 'noUpscaling': 'True', - 'twopass': 'True', - 'videoCodec': 'vp8', - 'audioCodec': 'vorbis', - 'type': 'video/webm codecs="vp8, vorbis"', - }, - 'an.webm': - { - 'crf': 10, - 'videoBitrate': '0', - 'noUpscaling': 'True', - 'twopass': 'True', - 'videoCodec': 'vp8', - 'noaudio': 'True', - 'type': 'video/webm codecs="vp8, vorbis"', - }, - + "webm": { + "crf": 10, + "videoBitrate": "0", + "audioQuality": 6, + "noUpscaling": "True", + "twopass": "True", + "videoCodec": "vp8", + "audioCodec": "vorbis", + "type": 'video/webm codecs="vp8, vorbis"', + }, + "an.webm": { + "crf": 10, + "videoBitrate": "0", + "noUpscaling": "True", + "twopass": "True", + "videoCodec": "vp8", + "noaudio": "True", + "type": 'video/webm codecs="vp8, vorbis"', + }, # WebM VP9 transcode: - 'vp9.webm': - { - 'crf': 35, - 'videoBitrate': '0', - 'audioBitrate': '128', - 'samplerate': '48000', - 'noUpscaling': 'True', - 'twopass': 'True', - 'altref': 'True', - 'videoCodec': 'vp9', - 'audioCodec': 'opus', - 'tileColumns': '4', - 'speed': '2', - 'quality': 'good', - 'type': 'video/webm codecs="vp9, opus"', - }, - 'an.vp9.webm': - { - 'crf': 35, - 'videoBitrate': '0', - 'noUpscaling': 'True', - 'twopass': 'True', - 'altref': 'True', - 'videoCodec': 'vp9', - 'noaudio': 'True', - 'tileColumns': '4', - 'speed': '2', - 'quality': 'good', - 'type': 'video/webm codecs="vp9, opus"', - }, - + "vp9.webm": { + "crf": 35, + "videoBitrate": "0", + "audioBitrate": "128", + "samplerate": "48000", + "noUpscaling": "True", + "twopass": "True", + "altref": "True", + "videoCodec": "vp9", + "audioCodec": "opus", + "tileColumns": "4", + "speed": "2", + "quality": "good", + "type": 'video/webm codecs="vp9, opus"', + }, + "an.vp9.webm": { + "crf": 35, + "videoBitrate": "0", + "noUpscaling": "True", + "twopass": "True", + "altref": "True", + "videoCodec": "vp9", + "noaudio": "True", + "tileColumns": "4", + "speed": "2", + "quality": "good", + "type": 'video/webm codecs="vp9, opus"', + }, # WebM AV1 transcode: # # Presets: https://gitlab.com/AOMediaCodec/SVT-AV1/-/blob/master/Docs/CommonQuestions.md#what-presets-do # Multipass: https://github.com/HandBrake/HandBrake/issues/4831#issuecomment-1546617210 - 'av1.webm': - { - 'audioBitrate': '128', - 'audioCodec': 'opus', - 'crf': 30, - 'preset': '6', - 'samplerate': '48000', - 'twopass': 'False', # twopass is not supported for AV1 with CRF - 'type': 'video/webm codecs="av01, opus"', - 'videoBitrate': '0', - 'videoCodec': 'av1', - }, - 'an.av1.webm': - { - 'crf': 30, - 'noaudio': 'True', - 
'preset': '6', - 'twopass': 'False', # twopass is not supported for AV1 with CRF - 'type': 'video/webm codecs="av01, opus"', - 'videoBitrate': '0', - 'videoCodec': 'av1', - }, - + "av1.webm": { + "audioBitrate": "128", + "audioCodec": "opus", + "crf": 30, + "preset": "6", + "samplerate": "48000", + "twopass": "False", # twopass is not supported for AV1 with CRF + "type": 'video/webm codecs="av01, opus"', + "videoBitrate": "0", + "videoCodec": "av1", + }, + "an.av1.webm": { + "crf": 30, + "noaudio": "True", + "preset": "6", + "twopass": "False", # twopass is not supported for AV1 with CRF + "type": 'video/webm codecs="av01, opus"', + "videoBitrate": "0", + "videoCodec": "av1", + }, # Audio profiles - 'ogg': - { - 'audioCodec': 'vorbis', - 'audioQuality': '6', - 'samplerate': '44100', - 'channels': '2', - 'noUpscaling': 'True', - 'novideo': 'True', - 'type': 'audio/ogg codecs="vorbis"', - }, - 'opus': - { - 'audioCodec': 'opus', - 'audioBitrate': '128', - 'samplerate': '48000', - 'channels': '2', - 'noUpscaling': 'True', - 'novideo': 'True', - 'type': 'audio/ogg codecs="opus"', - }, + "ogg": { + "audioCodec": "vorbis", + "audioQuality": "6", + "samplerate": "44100", + "channels": "2", + "noUpscaling": "True", + "novideo": "True", + "type": 'audio/ogg codecs="vorbis"', + }, + "opus": { + "audioCodec": "opus", + "audioBitrate": "128", + "samplerate": "48000", + "channels": "2", + "noUpscaling": "True", + "novideo": "True", + "type": 'audio/ogg codecs="opus"', + }, } diff --git a/video2commons/backend/encode/transcodejob.py b/video2commons/backend/encode/transcodejob.py index 85fda75c..0d656d2f 100644 --- a/video2commons/backend/encode/transcodejob.py +++ b/video2commons/backend/encode/transcodejob.py @@ -35,9 +35,14 @@ import signal from .transcode import WebVideoTranscode from .globals import ( - background_priority, background_time_limit, background_memory_limit, - background_size_limit, ffmpeg_threads, ffmpeg_location, escape_shellarg, - time_to_seconds + background_priority, + background_time_limit, + background_memory_limit, + background_size_limit, + ffmpeg_threads, + ffmpeg_location, + escape_shellarg, + time_to_seconds, ) from video2commons.exceptions import TaskAbort @@ -47,15 +52,21 @@ class WebVideoTranscodeJob(object): """Job class.""" def __init__( - self, source, target, key, preserve={}, - statuscallback=None, errorcallback=None, source_info=None, - concurrency=None + self, + source, + target, + key, + preserve={}, + statuscallback=None, + errorcallback=None, + source_info=None, + concurrency=None, ): """Initialize the instance.""" self.source = os.path.abspath(source) self.target = os.path.abspath(target) self.key = key - self.preserve = {'video': False, 'audio': False} + self.preserve = {"video": False, "audio": False} self.preserve.update(preserve) self.statuscallback = statuscallback or (lambda text, percent: None) self.errorcallback = errorcallback or (lambda text: None) @@ -83,8 +94,8 @@ def get_file(self): @return File """ - if not hasattr(self, 'file'): - self.file = open(self.source, 'r') + if not hasattr(self, "file"): + self.file = open(self.source, "r") self.file.close() return self.file @@ -95,8 +106,8 @@ def get_target_path(self): @return string """ - if not hasattr(self, 'targetEncodeFile'): - self.targetEncodeFile = open(self.target, 'w') + if not hasattr(self, "targetEncodeFile"): + self.targetEncodeFile = open(self.target, "w") self.targetEncodeFile.close() return self.targetEncodeFile.name @@ -107,7 +118,7 @@ def get_source_path(self): @return string|bool """ 
-        if not hasattr(self, 'sourceFilePath'):
+        if not hasattr(self, "sourceFilePath"):
             self.sourceFilePath = self.get_file().name
 
         return self.sourceFilePath
@@ -132,7 +143,7 @@ def run(self):
 
         # Validate the file exists:
         if not file:
-            self.set_error(self.source + ': File not found ')
+            self.set_error(self.source + ": File not found ")
             return False
 
         # Validate the transcode key param:
@@ -144,26 +155,26 @@ def run(self):
             return False
 
         # Validate the source exists:
-        if not self.get_source_path() or not \
-                os.path.isfile(self.get_source_path()):
-            status = self.source + ': Source not found'
+        if not self.get_source_path() or not os.path.isfile(self.get_source_path()):
+            status = self.source + ": Source not found"
             self.set_error(status, transcode_key)
             return False
 
         options = WebVideoTranscode.settings[transcode_key]
 
-        if 'novideo' in options:
-            self.output("Encoding to audio codec: " + options['audioCodec'])
+        if "novideo" in options:
+            self.output("Encoding to audio codec: " + options["audioCodec"])
         else:
-            self.output("Encoding to codec: " + options['videoCodec'])
+            self.output("Encoding to codec: " + options["videoCodec"])
 
         # Check the codec to see which encode method to call
-        if 'novideo' in options or self.preserve['video']:
+        if "novideo" in options or self.preserve["video"]:
             status = self.ffmpeg_encode(options)
-        elif options['videoCodec'] in ['vp8', 'vp9', 'h264', "av1"] or \
-                (options['videoCodec'] == 'theora'):
+        elif options["videoCodec"] in ["vp8", "vp9", "h264", "av1"] or (
+            options["videoCodec"] == "theora"
+        ):
             # Check for twopass:
-            if 'twopass' in options and options['twopass'] == 'True':
+            if "twopass" in options and options["twopass"] == "True":
                 # ffmpeg requires manual two pass
                 status = self.ffmpeg_encode(options, 1)
                 if status and not isinstance(status, str):
@@ -171,14 +182,14 @@ def run(self):
             else:
                 status = self.ffmpeg_encode(options)
         else:
-            self.output('Error unknown codec:' + options['videoCodec'])
-            status = 'Error unknown target codec:' + options['videoCodec']
+            self.output("Error unknown codec:" + options["videoCodec"])
+            status = "Error unknown target codec:" + options["videoCodec"]
 
         self.remove_ffmpeg_log_files()
 
         # If status is okay and target does not exist, reset status
         if status is True and not os.path.isfile(self.get_target_path()):
-            status = 'Target does not exist: ' + self.get_target_path()
+            status = "Target does not exist: " + self.get_target_path()
 
         # If status is ok and target is larger than 0 bytes
         if status is True and os.path.getsize(self.get_target_path()) > 0:
@@ -196,8 +207,8 @@ def remove_ffmpeg_log_files(self):
         if os.path.isdir(dir):
             for file in os.listdir(dir):
                 log_path = os.path.abspath(dir + "/" + file)
-                ext = file.split('.')[-1]
-                if ext == 'log' and log_path.startswith(path):
+                ext = file.split(".")[-1]
+                if ext == "log" and log_path.startswith(path):
                     os.unlink(log_path)
 
     def ffmpeg_encode(self, options, p=0):
@@ -209,63 +220,68 @@
        @param options
        @param p
        @return bool|string
        """
        if not os.path.isfile(self.get_source_path()):
-            return "source file is missing, " + self.get_source_path() + \
-                ". Encoding failed."
+            return (
+                "source file is missing, "
+                + self.get_source_path()
+                + ". Encoding failed."
+ ) # Set up the base command - cmd = escape_shellarg(ffmpeg_location) + ' -y -i ' + \ - escape_shellarg(self.get_source_path()) + cmd = ( + escape_shellarg(ffmpeg_location) + + " -y -i " + + escape_shellarg(self.get_source_path()) + ) cmd += " -max_muxing_queue_size 4096" - if 'vpre' in options: - cmd += ' -vpre ' + escape_shellarg(options['vpre']) + if "vpre" in options: + cmd += " -vpre " + escape_shellarg(options["vpre"]) # Copy non-standard custom metadata specific to mp4 and mov files container = self.source_info.format.format - if container == 'mov,mp4,m4a,3gp,3g2,mj2': - cmd += ' -movflags use_metadata_tags' + if container == "mov,mp4,m4a,3gp,3g2,mj2": + cmd += " -movflags use_metadata_tags" - cmd += ' -map_metadata 0' + cmd += " -map_metadata 0" - if 'novideo' in options: + if "novideo" in options: cmd += " -vn " - elif self.preserve['video']: + elif self.preserve["video"]: cmd += " -vcodec copy" - elif options['videoCodec'] == 'av1': + elif options["videoCodec"] == "av1": cmd += self.ffmpeg_add_av1_video_options(options, p) - elif options['videoCodec'] == 'vp8' or options['videoCodec'] == 'vp9': + elif options["videoCodec"] == "vp8" or options["videoCodec"] == "vp9": cmd += self.ffmpeg_add_webm_video_options(options, p) - elif options['videoCodec'] == 'h264': + elif options["videoCodec"] == "h264": cmd += self.ffmpeg_add_h264_video_options(options, p) - elif options['videoCodec'] == 'theora': + elif options["videoCodec"] == "theora": cmd += self.ffmpeg_add_theora_video_options(options, p) # Check for start time - if 'starttime' in options: - cmd += ' -ss ' + escape_shellarg(options['starttime']) + if "starttime" in options: + cmd += " -ss " + escape_shellarg(options["starttime"]) else: - options['starttime'] = 0 + options["starttime"] = 0 # Check for end time: - if 'endtime' in options: - cmd += ' -t ' + str(options['endtime']) - str(options['starttime']) + if "endtime" in options: + cmd += " -t " + str(options["endtime"]) - str(options["starttime"]) - if p == 1 or 'noaudio' in options: - cmd += ' -an' - elif self.preserve['audio']: + if p == 1 or "noaudio" in options: + cmd += " -an" + elif self.preserve["audio"]: cmd += " -acodec copy" else: cmd += self.ffmpeg_add_audio_options(options, p) if p != 0: cmd += " -pass " + escape_shellarg(p) - cmd += " -passlogfile " + \ - escape_shellarg(self.get_target_path() + '.log') + cmd += " -passlogfile " + escape_shellarg(self.get_target_path() + ".log") # And the output target: if p == 1: - cmd += ' /dev/null' + cmd += " /dev/null" else: cmd += " " + escape_shellarg(self.get_target_path()) @@ -275,8 +291,7 @@ def ffmpeg_encode(self, options, p=0): retval, shellOutput = self.run_shell_exec(cmd, track=p != 1) if int(retval) != 0: - return cmd + \ - "\nExitcode: " + str(retval) + return cmd + "\nExitcode: " + str(retval) return True @@ -291,8 +306,8 @@ def ffmpeg_add_h264_video_options(self, options, p): # Set the codec: cmd = " -threads " + str(self.ffmpeg_get_thread_count()) + " -vcodec libx264" - if 'videoBitrate' in options: - cmd += " -b " + escape_shellarg(options['videoBitrate']) + if "videoBitrate" in options: + cmd += " -b " + escape_shellarg(options["videoBitrate"]) # Output mp4 cmd += " -f mp4" @@ -306,26 +321,26 @@ def ffmpeg_add_av1_video_options(self, options, p): @param p @return string """ - cmd = ' -threads ' + str(self.ffmpeg_get_thread_count()) + cmd = " -threads " + str(self.ffmpeg_get_thread_count()) # libsvtav1-specific constant quality - if 'crf' in options: - cmd += " -crf " + escape_shellarg(options['crf']) + if 
"crf" in options: + cmd += " -crf " + escape_shellarg(options["crf"]) - if 'videoBitrate' in options: - if int(options['videoBitrate']) > 0: + if "videoBitrate" in options: + if int(options["videoBitrate"]) > 0: cmd += " -qmin 1 -qmax 63" - cmd += " -b:v " + escape_shellarg(int(options['videoBitrate']) * 1000) + cmd += " -b:v " + escape_shellarg(int(options["videoBitrate"]) * 1000) cmd += " -vcodec libsvtav1" # libsvtav1 ignores the -threads option, so we have to set it manually. - cmd += ' -svtav1-params lp=' + str(self.ffmpeg_get_thread_count()) + cmd += " -svtav1-params lp=" + str(self.ffmpeg_get_thread_count()) if p == 1: - cmd += ' -preset 12' # Make first pass faster - elif 'preset' in options: - cmd += ' -preset ' + escape_shellarg(options['preset']) + cmd += " -preset 12" # Make first pass faster + elif "preset" in options: + cmd += " -preset " + escape_shellarg(options["preset"]) cmd += " -f webm" @@ -339,67 +354,65 @@ def ffmpeg_add_webm_video_options(self, options, p): @param p @return string """ - cmd = ' -threads ' + str(self.ffmpeg_get_thread_count()) - if options['videoCodec'] == 'vp9': - cmd += ' -row-mt 1' + cmd = " -threads " + str(self.ffmpeg_get_thread_count()) + if options["videoCodec"] == "vp9": + cmd += " -row-mt 1" # check for presets: - if 'preset' in options: - if options['preset'] == "360p": + if "preset" in options: + if options["preset"] == "360p": cmd += " -vpre libvpx-360p" - elif options['preset'] == "720p": + elif options["preset"] == "720p": cmd += " -vpre libvpx-720p" - elif options['preset'] == "1080p": + elif options["preset"] == "1080p": cmd += " -vpre libvpx-1080p" # Check for video quality: - if 'videoQuality' in options and int(options['videoQuality']) >= 0: + if "videoQuality" in options and int(options["videoQuality"]) >= 0: # Map 0-10 to 63-0, higher values worse quality - quality = 63 - int(int(options['videoQuality']) / 10.0 * 63) + quality = 63 - int(int(options["videoQuality"]) / 10.0 * 63) cmd += " -qmin " + escape_shellarg(quality) cmd += " -qmax " + escape_shellarg(quality) # libvpx-specific constant quality or constrained quality # note the range is different between VP8 and VP9 - if 'crf' in options: - cmd += " -crf " + escape_shellarg(options['crf']) + if "crf" in options: + cmd += " -crf " + escape_shellarg(options["crf"]) # Check for video bitrate: - if 'videoBitrate' in options: + if "videoBitrate" in options: cmd += " -qmin 1 -qmax 51" - cmd += " -b:v " + escape_shellarg(int(options['videoBitrate']) * 1000) + cmd += " -b:v " + escape_shellarg(int(options["videoBitrate"]) * 1000) # Set the codec: - if options['videoCodec'] == 'vp9': + if options["videoCodec"] == "vp9": cmd += " -vcodec libvpx-vp9" - if 'tileColumns' in options: - cmd += ' -tile-columns ' + \ - escape_shellarg(options['tileColumns']) + if "tileColumns" in options: + cmd += " -tile-columns " + escape_shellarg(options["tileColumns"]) else: cmd += " -vcodec libvpx" - if 'altref' in options: - cmd += ' -auto-alt-ref 1' - cmd += ' -lag-in-frames 25' + if "altref" in options: + cmd += " -auto-alt-ref 1" + cmd += " -lag-in-frames 25" # Check for keyframeInterval - if 'keyframeInterval' in options: - cmd += ' -g ' + escape_shellarg(options['keyframeInterval']) - cmd += ' -keyint_min ' + \ - escape_shellarg(options['keyframeInterval']) + if "keyframeInterval" in options: + cmd += " -g " + escape_shellarg(options["keyframeInterval"]) + cmd += " -keyint_min " + escape_shellarg(options["keyframeInterval"]) - if 'deinterlace' in options: - cmd += ' -deinterlace' + if "deinterlace" 
in options: + cmd += " -deinterlace" if p == 1: # Make first pass faster... - cmd += ' -speed 4' - elif 'speed' in options: - cmd += ' -speed ' + escape_shellarg(options['speed']) + cmd += " -speed 4" + elif "speed" in options: + cmd += " -speed " + escape_shellarg(options["speed"]) # In libvpx quality sets a deadline on how long frames can be processed. - if 'quality' in options: - cmd += ' -quality ' + escape_shellarg(options['quality']) + if "quality" in options: + cmd += " -quality " + escape_shellarg(options["quality"]) # Output WebM cmd += " -f webm" @@ -416,31 +429,30 @@ def ffmpeg_add_theora_video_options(self, options, p): @param p @return string """ - cmd = ' -threads ' + str(self.ffmpeg_get_thread_count()) + cmd = " -threads " + str(self.ffmpeg_get_thread_count()) # Check for video quality: - if 'videoQuality' in options and int(options['videoQuality']) >= 0: - cmd += " -q:v " + escape_shellarg(options['videoQuality']) + if "videoQuality" in options and int(options["videoQuality"]) >= 0: + cmd += " -q:v " + escape_shellarg(options["videoQuality"]) # Check for video bitrate: - if 'videoBitrate' in options: + if "videoBitrate" in options: cmd += " -qmin 1 -qmax 51" - cmd += " -b:v " + escape_shellarg(int(options['videoBitrate']) * 1000) + cmd += " -b:v " + escape_shellarg(int(options["videoBitrate"]) * 1000) # Set the codec: cmd += " -vcodec theora" # Check for keyframeInterval - if 'keyframeInterval' in options: - cmd += ' -g ' + escape_shellarg(options['keyframeInterval']) - cmd += ' -keyint_min ' + \ - escape_shellarg(options['keyframeInterval']) + if "keyframeInterval" in options: + cmd += " -g " + escape_shellarg(options["keyframeInterval"]) + cmd += " -keyint_min " + escape_shellarg(options["keyframeInterval"]) - if 'deinterlace' in options: - cmd += ' -deinterlace' + if "deinterlace" in options: + cmd += " -deinterlace" - if 'framerate' in options: - cmd += ' -r ' + escape_shellarg(options['framerate']) + if "framerate" in options: + cmd += " -r " + escape_shellarg(options["framerate"]) # Output Ogg cmd += " -f ogg" @@ -455,34 +467,34 @@ def ffmpeg_add_audio_options(self, options, p): @param p @return string """ - cmd = '' - if 'audioQuality' in options: - cmd += " -aq " + escape_shellarg(options['audioQuality']) + cmd = "" + if "audioQuality" in options: + cmd += " -aq " + escape_shellarg(options["audioQuality"]) - if 'audioBitrate' in options: - cmd += ' -b:a ' + str(int(options['audioBitrate']) * 1000) + if "audioBitrate" in options: + cmd += " -b:a " + str(int(options["audioBitrate"]) * 1000) - if 'samplerate' in options: - cmd += " -ar " + escape_shellarg(options['samplerate']) + if "samplerate" in options: + cmd += " -ar " + escape_shellarg(options["samplerate"]) - if 'channels' in options: - cmd += " -ac " + escape_shellarg(options['channels']) + if "channels" in options: + cmd += " -ac " + escape_shellarg(options["channels"]) - if 'audioCodec' in options: + if "audioCodec" in options: encoders = { - 'vorbis': 'libvorbis', - 'opus': 'libopus', - 'mp3': 'libmp3lame', + "vorbis": "libvorbis", + "opus": "libopus", + "mp3": "libmp3lame", } - if options['audioCodec'] in encoders: - codec = encoders[options['audioCodec']] + if options["audioCodec"] in encoders: + codec = encoders[options["audioCodec"]] else: - codec = options['audioCodec'] + codec = options["audioCodec"] cmd += " -acodec " + escape_shellarg(codec) - if codec == 'aac': + if codec == "aac": # the aac encoder is currently "experimental" in libav 9? 
:P - cmd += ' -strict experimental' + cmd += " -strict experimental" else: # if no audio codec set use vorbis : cmd += " -acodec libvorbis " @@ -504,21 +516,36 @@ def run_shell_exec(self, cmd, track=True): @param cmd String Command to be run @return int, string """ - cmd = 'ulimit -f ' + escape_shellarg(background_size_limit) + ';' + \ - 'ulimit -v ' + escape_shellarg(background_memory_limit) + ';' + \ - 'nice -n ' + escape_shellarg(background_priority) + ' ' + \ - 'timeout ' + escape_shellarg(background_time_limit) + ' ' + \ - cmd + \ - ' 2>&1' + cmd = ( + "ulimit -f " + + escape_shellarg(background_size_limit) + + ";" + + "ulimit -v " + + escape_shellarg(background_memory_limit) + + ";" + + "nice -n " + + escape_shellarg(background_priority) + + " " + + "timeout " + + escape_shellarg(background_time_limit) + + " " + + cmd + + " 2>&1" + ) # Adapted from https://gist.github.com/marazmiki/3015621 process = subprocess.Popen( - cmd, stdin=None, stdout=subprocess.PIPE, stderr=None, - universal_newlines=True, shell=True, preexec_fn=os.setsid + cmd, + stdin=None, + stdout=subprocess.PIPE, + stderr=None, + universal_newlines=True, + shell=True, + preexec_fn=os.setsid, ) - re_duration = re.compile(r'Duration: (\d{2}:\d{2}:\d{2})') - re_position = re.compile(r'time=(\d{2}:\d{2}:\d{2})', re.I) + re_duration = re.compile(r"Duration: (\d{2}:\d{2}:\d{2})") + re_position = re.compile(r"time=(\d{2}:\d{2}:\d{2})", re.I) duration = None position = None @@ -542,9 +569,9 @@ def run_shell_exec(self, cmd, track=True): if position_match: position = time_to_seconds(position_match.group(1)) if duration and position: - newpercentage = min(int( - math.floor(100 * position / duration) - ), 100) + newpercentage = min( + int(math.floor(100 * position / duration)), 100 + ) if newpercentage != percentage: percentage = newpercentage @@ -557,4 +584,4 @@ def run_shell_exec(self, cmd, track=True): time.sleep(2) process.stdout.close() - return process.returncode, '' + return process.returncode, "" diff --git a/video2commons/backend/subtitles/__init__.py b/video2commons/backend/subtitles/__init__.py index 950c3ce7..f9c10e4a 100644 --- a/video2commons/backend/subtitles/__init__.py +++ b/video2commons/backend/subtitles/__init__.py @@ -34,12 +34,11 @@ def upload(site, filename, text, langcode, langname): """Upload subtitles to Wikimedia Commons.""" - page = pywikibot.Page(site, f'TimedText:{filename}.{langcode.lower()}.srt') + page = pywikibot.Page(site, f"TimedText:{filename}.{langcode.lower()}.srt") page.text = text if not page.exists(): page.save( - summary=f'Import {langname} subtitles for [[:File:{filename}]]', - minor=False + summary=f"Import {langname} subtitles for [[:File:{filename}]]", minor=False ) @@ -47,28 +46,36 @@ def get_container_subtitle_languages(filepath): """Returns subtitle languages contained in a video container.""" languages = set() - result = subprocess.run([ - ffprobe_location, - '-loglevel', 'error', - '-select_streams', 's', - '-show_entries', 'stream=index:stream_tags=language', - '-of', 'json', - filepath - ], capture_output=True, text=True) + result = subprocess.run( + [ + ffprobe_location, + "-loglevel", + "error", + "-select_streams", + "s", + "-show_entries", + "stream=index:stream_tags=language", + "-of", + "json", + filepath, + ], + capture_output=True, + text=True, + ) if result.returncode != 0: return set() - for stream in json.loads(result.stdout).get('streams', []): - has_language = 'tags' in stream and 'language' in stream['tags'] - has_index = 'index' in stream + for stream in 
json.loads(result.stdout).get("streams", []): + has_language = "tags" in stream and "language" in stream["tags"] + has_index = "index" in stream # Skip unlabelled subtitles that have no language tag. if not has_language or not has_index: continue try: - langcode = langcodes.standardize_tag(stream['tags']['language']) + langcode = langcodes.standardize_tag(stream["tags"]["language"]) except LanguageTagError: continue # Skip subtitles with invalid language tags. @@ -91,58 +98,67 @@ def get_subtitle_languages(subtitles): return languages -def upload_container_subtitles(filepath, filename, outputdir, username, statuscallback=None): + +def upload_container_subtitles( + filepath, filename, outputdir, username, statuscallback=None +): """Extract subtitles from a video container that supports it (e.g. mkv).""" statuscallback = statuscallback or (lambda text, percent: None) - statuscallback('Uploading subtitles...', -1) + statuscallback("Uploading subtitles...", -1) percent = 0 - result = subprocess.run([ - ffprobe_location, - '-loglevel', 'error', - '-select_streams', 's', - '-show_entries', 'stream=index:stream_tags=language', - '-of', 'json', - filepath - ], capture_output=True, text=True) + result = subprocess.run( + [ + ffprobe_location, + "-loglevel", + "error", + "-select_streams", + "s", + "-show_entries", + "stream=index:stream_tags=language", + "-of", + "json", + filepath, + ], + capture_output=True, + text=True, + ) if result.returncode != 0: statuscallback( - f'Failed to extract subtitles: {result.stderr or result.returncode}', - None + f"Failed to extract subtitles: {result.stderr or result.returncode}", None ) return subtitles = [] languages = set() - streams = json.loads(result.stdout).get('streams', []) + streams = json.loads(result.stdout).get("streams", []) if not streams: - statuscallback('No subtitles found in container', 100) + statuscallback("No subtitles found in container", 100) return - statuscallback(f'Extracting subtitles for {len(streams)} language(s)...', -1) + statuscallback(f"Extracting subtitles for {len(streams)} language(s)...", -1) # Extract all subtitles from the video container (0-50%). for stream in streams: - has_language = 'tags' in stream and 'language' in stream['tags'] - has_index = 'index' in stream + has_language = "tags" in stream and "language" in stream["tags"] + has_index = "index" in stream # Skip unlabelled subtitles that have no language tag. if not has_language or not has_index: percent += 50.0 / len(streams) - statuscallback('Skipping subtitles missing required tags', None) + statuscallback("Skipping subtitles missing required tags", None) continue try: - langcode = langcodes.standardize_tag(stream['tags']['language']) + langcode = langcodes.standardize_tag(stream["tags"]["language"]) except LanguageTagError: percent += 50.0 / len(streams) statuscallback( - f'Skipping subtitles with invalid language tag: {langcode}', - None + f"Skipping subtitles with invalid language tag: {langcode}", None ) continue # Skip subtitles with invalid language tags. 
@@ -151,50 +167,56 @@ def upload_container_subtitles(filepath, filename, outputdir, username, statusca if langcode in languages: percent += 50.0 / len(streams) statuscallback( - f'Skipping duplicate subtitles with language: {langcode}', - None + f"Skipping duplicate subtitles with language: {langcode}", None ) continue else: languages.add(langcode) langname = Language.make(language=langcode).display_name() - statuscallback(f'Extracting {langname} subtitles...', int(percent)) + statuscallback(f"Extracting {langname} subtitles...", int(percent)) - srt_filepath = os.path.join(outputdir, f'{filename}.{langcode.lower()}.srt') + srt_filepath = os.path.join(outputdir, f"{filename}.{langcode.lower()}.srt") # Write the subtitles to the output directory of the job. - result = subprocess.run([ - ffmpeg_location, - '-nostdin', - '-hide_banner', - '-loglevel', 'quiet', - '-i', filepath, - '-map', f'0:{stream["index"]}', - srt_filepath - ], capture_output=True, text=True) + result = subprocess.run( + [ + ffmpeg_location, + "-nostdin", + "-hide_banner", + "-loglevel", + "quiet", + "-i", + filepath, + "-map", + f"0:{stream['index']}", + srt_filepath, + ], + capture_output=True, + text=True, + ) percent += 50.0 / len(streams) if result.returncode != 0: statuscallback( f"Failed to extract '{langcode.lower()}' subtitles: {result.stderr or result.returncode}", - int(percent) + int(percent), ) continue subtitles.append((langcode, langname, srt_filepath)) if not subtitles: - statuscallback('No subtitles extracted successfully', 100) + statuscallback("No subtitles extracted successfully", 100) return # Attempt uploads only after successful extraction of all subtitles (50-100%). for langcode, langname, srt_filepath in subtitles: try: - statuscallback(f'Uploading {langname} subtitles...', int(percent)) + statuscallback(f"Uploading {langname} subtitles...", int(percent)) - with open(srt_filepath, 'rb') as f: + with open(srt_filepath, "rb") as f: text = f.read() # Try to first decode the subtitles as UTF-8 if possible rather @@ -205,11 +227,10 @@ def upload_container_subtitles(filepath, filename, outputdir, username, statusca text = decoded_text else: # It's not UTF-8, so try to detect the encoding. 
- encoding = chardet.detect(text)['encoding'] + encoding = chardet.detect(text)["encoding"] if not encoding: statuscallback( - f'Skipping subtitles with invalid encoding: {langcode}', - None + f"Skipping subtitles with invalid encoding: {langcode}", None ) continue @@ -217,43 +238,40 @@ def upload_container_subtitles(filepath, filename, outputdir, username, statusca text = text.decode(encoding) except Exception: statuscallback( - f'Skipping subtitles with invalid encoding: {langcode}', - None + f"Skipping subtitles with invalid encoding: {langcode}", None ) continue upload( - site=pywikibot.Site('commons', 'commons', user=username), + site=pywikibot.Site("commons", "commons", user=username), filename=filename, text=text, langcode=langcode, - langname=langname + langname=langname, ) percent += 50.0 / len(subtitles) - statuscallback(f'Finished uploading {langname} subtitles', int(percent)) + statuscallback(f"Finished uploading {langname} subtitles", int(percent)) except TaskAbort: raise except Exception as e: percent += 50.0 / len(subtitles) - statuscallback(f'{type(e).__name__}: {e}\n\n{traceback.format_exc()}', int(percent)) + statuscallback( + f"{type(e).__name__}: {e}\n\n{traceback.format_exc()}", int(percent) + ) def upload_subtitles( - subtitles, wikifilename, username, - statuscallback=None, errorcallback=None + subtitles, wikifilename, username, statuscallback=None, errorcallback=None ): """Convert and upload subtitles to corresponding TimedText pages.""" statuscallback = statuscallback or (lambda text, percent: None) errorcallback = errorcallback or (lambda text: None) - statuscallback('Uploading subtitles...', -1) + statuscallback("Uploading subtitles...", -1) percent = 0 - c = Converter( - ffmpeg_path=ffmpeg_location, - ffprobe_path=ffprobe_location - ) + c = Converter(ffmpeg_path=ffmpeg_location, ffprobe_path=ffprobe_location) for langcode, filename in list(subtitles.items()): try: @@ -261,74 +279,63 @@ def upload_subtitles( langcode = str(lang).lower() langdesc = lang.describe() - langname = langdesc['language'] - del langdesc['language'] + langname = langdesc["language"] + del langdesc["language"] if langdesc: - langname += ' (%s)' % ', '.join(list(langdesc.values())) + langname += " (%s)" % ", ".join(list(langdesc.values())) - statuscallback('Loading subtitles in ' + langname, int(percent)) - subtitletext = '' + statuscallback("Loading subtitles in " + langname, int(percent)) + subtitletext = "" info = c.probe(filename) if not info: continue if len(info.streams) != 1: continue - if info.streams[0].type != 'subtitle': + if info.streams[0].type != "subtitle": continue format = info.streams[0].codec - if format.lower() != 'subrip': - target = filename + '.srt' - cmd = [ - ffmpeg_location, - '-i', filename, - '-f', 'srt', - target - ] + if format.lower() != "subrip": + target = filename + ".srt" + cmd = [ffmpeg_location, "-i", filename, "-f", "srt", target] statuscallback("Running cmd: %s" % cmd, None) subprocess.check_call(cmd, stderr=None) filename = target - with open(filename, 'rb') as f: + with open(filename, "rb") as f: subtitletext = f.read() - subtitletext = subtitletext.decode( - chardet.detect(subtitletext)['encoding'] - ) + subtitletext = subtitletext.decode(chardet.detect(subtitletext)["encoding"]) percent += 50.0 / len(subtitles) - statuscallback( - 'Uploading subtitles in ' + langname, - int(percent) - ) + statuscallback("Uploading subtitles in " + langname, int(percent)) # ENSURE PYWIKIBOT OAUTH PROPERLY CONFIGURED! 
- site = pywikibot.Site('commons', 'commons', user=username) + site = pywikibot.Site("commons", "commons", user=username) upload( site=site, filename=wikifilename, text=subtitletext, langcode=langcode, - langname=langname + langname=langname, ) percent += 50.0 / len(subtitles) - statuscallback( - 'Finished processing subtitles in ' + langname, - int(percent) - ) + statuscallback("Finished processing subtitles in " + langname, int(percent)) except TaskAbort: raise except Exception as e: - statuscallback(f'{type(e).__name__}: {e} \n\n{traceback.format_exc()}', None) + statuscallback( + f"{type(e).__name__}: {e} \n\n{traceback.format_exc()}", None + ) pass def parse_utf8(bytestring): """Try to decode a bytestring as UTF-8, returning None on failure.""" try: - return bytestring.decode('utf-8') + return bytestring.decode("utf-8") except UnicodeDecodeError: return None diff --git a/video2commons/backend/upload/__init__.py b/video2commons/backend/upload/__init__.py index d1c05871..8d7b6396 100644 --- a/video2commons/backend/upload/__init__.py +++ b/video2commons/backend/upload/__init__.py @@ -34,8 +34,14 @@ def upload( - filename, wikifilename, sourceurl, http_host, filedesc, username, - statuscallback=None, errorcallback=None + filename, + wikifilename, + sourceurl, + http_host, + filedesc, + username, + statuscallback=None, + errorcallback=None, ): """Upload a file from filename to wikifilename.""" statuscallback = statuscallback or (lambda text, percent: None) @@ -45,54 +51,78 @@ def upload( if size < 1000000000: return upload_pwb( - filename, wikifilename, sourceurl, filedesc, username, - size, statuscallback, errorcallback + filename, + wikifilename, + sourceurl, + filedesc, + username, + size, + statuscallback, + errorcallback, ) elif size < (5 << 30): try: return upload_pwb( - filename, wikifilename, sourceurl, filedesc, username, - size, statuscallback, errorcallback + filename, + wikifilename, + sourceurl, + filedesc, + username, + size, + statuscallback, + errorcallback, ) except pywikibot.exceptions.APIError as e: - if 'stash' in e.code or e.code == 'backend-fail-internal': + if "stash" in e.code or e.code == "backend-fail-internal": upload_ss( - filename, wikifilename, http_host, filedesc, - statuscallback, errorcallback + filename, + wikifilename, + http_host, + filedesc, + statuscallback, + errorcallback, ) else: raise else: errorcallback( - 'Sorry, but files larger than 5GB can not be uploaded even ' + - 'with server-side uploading. This task may need manual ' + - ' intervention.' + "Sorry, but files larger than 5GB can not be uploaded even " + + "with server-side uploading. This task may need manual " + + " intervention." ) def upload_pwb( - filename, wikifilename, sourceurl, filedesc, username, - size, statuscallback, errorcallback + filename, + wikifilename, + sourceurl, + filedesc, + username, + size, + statuscallback, + errorcallback, ): """Upload with pywikibot.""" # ENSURE PYWIKIBOT OAUTH PROPERLY CONFIGURED! - site = pywikibot.Site('commons', 'commons', user=username) + site = pywikibot.Site("commons", "commons", user=username) page = pywikibot.FilePage(site, wikifilename) if page.exists(): - errorcallback('File already exists. Please choose another name.') + errorcallback("File already exists. 
Please choose another name.") - comment = 'Imported media from ' + sourceurl + comment = "Imported media from " + sourceurl chunked = (16 * (1 << 20)) if size >= 100000000 else 0 remaining_tries = MAX_RETRIES while True: if remaining_tries == MAX_RETRIES: - statuscallback('Uploading...', -1) + statuscallback("Uploading...", -1) elif remaining_tries > 1: - statuscallback(f'Retrying upload... ({remaining_tries} tries remaining)', -1) + statuscallback( + f"Retrying upload... ({remaining_tries} tries remaining)", -1 + ) elif remaining_tries == 1: - statuscallback(f'Retrying upload... ({remaining_tries} try remaining)', -1) + statuscallback(f"Retrying upload... ({remaining_tries} try remaining)", -1) if remaining_tries != MAX_RETRIES: exponential_backoff(remaining_tries) @@ -105,9 +135,9 @@ def upload_pwb( text=filedesc, chunk_size=chunked, asynchronous=bool(chunked), - ignore_warnings=['exists-normalized'], + ignore_warnings=["exists-normalized"], ): - errorcallback('Upload failed!') + errorcallback("Upload failed!") break # The upload completed successfully. except TaskError: @@ -126,18 +156,17 @@ def upload_pwb( if remaining_tries == 0: raise # No more retries, raise the error. - statuscallback('Upload success!', 100) + statuscallback("Upload success!", 100) return page.title(with_ns=False), page.full_url() def upload_ss( - filename, wikifilename, http_host, filedesc, - statuscallback, errorcallback + filename, wikifilename, http_host, filedesc, statuscallback, errorcallback ): """Prepare for server-side upload.""" # Get hash md5 = hashlib.md5() - with open(filename, 'rb') as f: + with open(filename, "rb") as f: while True: data = f.read(65536) if not data: @@ -145,21 +174,27 @@ def upload_ss( md5.update(data) # file name check - wikifilename = wikifilename.replace('/', '-').replace(' ', '_') - wikifilename = wikifilename.replace('\r\n', '_') - wikifilename = wikifilename.replace('\r', '_').replace('\n', '_') + wikifilename = wikifilename.replace("/", "-").replace(" ", "_") + wikifilename = wikifilename.replace("\r\n", "_") + wikifilename = wikifilename.replace("\r", "_").replace("\n", "_") - newfilename = '/srv/v2c/ssu/' + wikifilename + newfilename = "/srv/v2c/ssu/" + wikifilename remaining_tries = MAX_RETRIES while True: try: if remaining_tries == MAX_RETRIES: - statuscallback('Preparing for server-side upload...', -1) + statuscallback("Preparing for server-side upload...", -1) elif remaining_tries > 1: - statuscallback(f'Retrying server-side upload preparation... ({remaining_tries} tries remaining)', -1) + statuscallback( + f"Retrying server-side upload preparation... ({remaining_tries} tries remaining)", + -1, + ) elif remaining_tries == 1: - statuscallback(f'Retrying server-side upload preparation... ({remaining_tries} try remaining)', -1) + statuscallback( + f"Retrying server-side upload preparation... ({remaining_tries} try remaining)", + -1, + ) if remaining_tries != MAX_RETRIES: exponential_backoff(remaining_tries) @@ -173,16 +208,16 @@ def upload_ss( remaining_tries -= 1 if remaining_tries == 0: # No more retries, raise the error. 
- errorcallback('Upload failed: NFS share is likely overloaded') + errorcallback("Upload failed: NFS share is likely overloaded") - with open(newfilename + '.txt', 'w') as filedescfile: + with open(newfilename + ".txt", "w") as filedescfile: filedesc = filedesc.replace( - '[[Category:Uploaded with video2commons]]', - '[[Category:Uploaded with video2commons/Server-side uploads]]' + "[[Category:Uploaded with video2commons]]", + "[[Category:Uploaded with video2commons/Server-side uploads]]", ) filedescfile.write(filedesc) - fileurl = 'https://' + http_host + '/' + wikifilename + fileurl = "https://" + http_host + "/" + wikifilename raise NeedServerSideUpload(fileurl, md5.hexdigest()) diff --git a/video2commons/backend/user-config.py b/video2commons/backend/user-config.py index 6e909215..884b1ac1 100644 --- a/video2commons/backend/user-config.py +++ b/video2commons/backend/user-config.py @@ -4,7 +4,7 @@ """Pywikibot configs.""" -family = 'commons' -mylang = 'commons' +family = "commons" +mylang = "commons" socket_timeout = 30, 300 # chunked uploading unreliable diff --git a/video2commons/backend/worker.py b/video2commons/backend/worker.py index bf830d82..369c7b27 100644 --- a/video2commons/backend/worker.py +++ b/video2commons/backend/worker.py @@ -17,8 +17,6 @@ """video2commons backend worker.""" - - import os import sys import shutil @@ -37,19 +35,19 @@ from video2commons.backend import upload from video2commons.backend import subtitles as subtitleuploader from video2commons.config import ( - redis_pw, redis_host, consumer_key, consumer_secret, http_host + redis_pw, + redis_host, + consumer_key, + consumer_secret, + http_host, ) from video2commons.shared.stats import update_task_stats -redisurl = 'redis://:' + redis_pw + '@' + redis_host + ':6379/' -app = celery.Celery( - 'v2cbackend', - backend=redisurl + '1', - broker=redisurl + '2' -) +redisurl = "redis://:" + redis_pw + "@" + redis_host + ":6379/" +app = celery.Celery("v2cbackend", backend=redisurl + "1", broker=redisurl + "2") app.conf.result_expires = 30 * 24 * 3600 # 1 month -app.conf.accept_content = ['json'] +app.conf.accept_content = ["json"] app.conf.worker_prefetch_multiplier = 1 redisconnection = Redis(host=redis_host, db=3, password=redis_pw) @@ -58,27 +56,35 @@ class Stats: """Storage for task status.""" - text = '' + text = "" percent = 0 def get_worker_concurrency(): """Parse concurrency value from CELERYD_OPTS environment variable.""" - celeryd_opts = os.environ.get('CELERYD_OPTS', '') + celeryd_opts = os.environ.get("CELERYD_OPTS", "") - match = re.search(r'--concurrency[=\s]+(\d+)', celeryd_opts) + match = re.search(r"--concurrency[=\s]+(\d+)", celeryd_opts) if match: return int(match.group(1)) @app.task(bind=True, track_started=False, base=AbortableTask) def main( - self, url, ie_key, subtitles, filename, filedesc, - downloadkey, convertkey, username, oauth + self, + url, + ie_key, + subtitles, + filename, + filedesc, + downloadkey, + convertkey, + username, + oauth, ): """Main worker code.""" # Get a lock to prevent double-running with same task ID - lockkey = 'tasklock:' + self.request.id + lockkey = "tasklock:" + self.request.id if redisconnection.exists(lockkey): raise Ignore @@ -90,9 +96,9 @@ def main( pass # We don't want to fail the task if we can't update stats. 
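The tasklock check above is a lightweight Redis guard against running the same task ID twice. A rough standalone sketch of the pattern, assuming a local Redis instance and reusing the "tasklock:" key prefix and the 7-day TTL that this hunk passes to setex:

from redis import Redis

redisconnection = Redis()  # assumed local instance; the worker passes host/db/password


def acquire_task_lock(task_id, hostname):
    """Claim a task for this worker, or report that it was already claimed."""
    lockkey = "tasklock:" + task_id
    if redisconnection.exists(lockkey):
        return False  # Another worker already picked this task up.
    # Record which host took the task and expire the claim after 7 days.
    redisconnection.setex(lockkey, 7 * 24 * 3600, hostname)
    return True

Note that the exists/setex pair is not atomic; a single redisconnection.set(lockkey, hostname, nx=True, ex=7 * 24 * 3600) would close that race.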
# Check for 10G of disk space, refuse to run if it is unavailable - st = os.statvfs('/srv') + st = os.statvfs("/srv") if st.f_frsize * st.f_bavail < 10 << 30: - self.retry(max_retries=20, countdown=5*60) + self.retry(max_retries=20, countdown=5 * 60) assert False # should never reach here redisconnection.setex(lockkey, 7 * 24 * 3600, self.request.hostname) @@ -100,7 +106,7 @@ def main( # Generate temporary directory for task for i in range(10): # 10 tries id = os.urandom(8).hex() - outputdir = '/srv/v2c/output/' + id + outputdir = "/srv/v2c/output/" + id if not os.path.isdir(outputdir): os.makedirs(outputdir) break @@ -116,52 +122,56 @@ def statuscallback(text, percent): s.text = text if percent is not None: s.percent = percent - print('%d: %s' % (s.percent, s.text)) + print("%d: %s" % (s.percent, s.text)) - self.update_state( - state='PROGRESS', - meta={'text': s.text, 'percent': s.percent} - ) + self.update_state(state="PROGRESS", meta={"text": s.text, "percent": s.percent}) def errorcallback(text): raise TaskError(text) try: - statuscallback('Downloading...', -1) + statuscallback("Downloading...", -1) d = download.download( - url, ie_key, downloadkey, subtitles, - outputdir, statuscallback, errorcallback + url, + ie_key, + downloadkey, + subtitles, + outputdir, + statuscallback, + errorcallback, ) if not d: - errorcallback('Download failed!') - file = d['target'] + errorcallback("Download failed!") + file = d["target"] if not file: - errorcallback('Download failed!') + errorcallback("Download failed!") source = file # Remember intent with subtitles so categories can be added # appropriately later. These can be strings, so convert to bool. subtitles_requested = subtitles - if type(subtitles_requested) == str: - subtitles_requested = subtitles_requested.lower() == 'true' + if type(subtitles_requested) is str: + subtitles_requested = subtitles_requested.lower() == "true" - subtitles = subtitles and d['subtitles'] + subtitles = subtitles and d["subtitles"] - statuscallback('Converting...', -1) + statuscallback("Converting...", -1) concurrency = get_worker_concurrency() file = encode.encode( file, convertkey, statuscallback, errorcallback, concurrency ) if not file: - errorcallback('Convert failed!') - ext = file.split('.')[-1] + errorcallback("Convert failed!") + ext = file.split(".")[-1] - statuscallback('Configuring Pywikibot...', -1) - pywikibot.config.authenticate['commons.wikimedia.org'] = \ - (consumer_key, consumer_secret) + tuple(oauth) - pywikibot.config.usernames['commons']['commons'] = username - pywikibot.Site('commons', 'commons', user=username).login() + statuscallback("Configuring Pywikibot...", -1) + pywikibot.config.authenticate["commons.wikimedia.org"] = ( + consumer_key, + consumer_secret, + ) + tuple(oauth) + pywikibot.config.usernames["commons"]["commons"] = username + pywikibot.Site("commons", "commons", user=username).login() # Identify the language codes of all present subtitles. Fallback to # checking the container ONLY IF yt-dlp was unable to find subtitles. @@ -169,28 +179,37 @@ def errorcallback(text): if subtitles: found_langcodes.update(subtitleuploader.get_subtitle_languages(subtitles)) elif subtitles_requested: - found_langcodes.update(subtitleuploader.get_container_subtitle_languages(source)) + found_langcodes.update( + subtitleuploader.get_container_subtitle_languages(source) + ) # Add additional inferable meta-categories to the file description. 
found_categories = set() found_categories.update(categories.get_inferable_categories(file)) - found_categories.update(categories.get_subtitle_categories(file, found_langcodes)) + found_categories.update( + categories.get_subtitle_categories(file, found_langcodes) + ) filedesc = categories.append_categories(filedesc, found_categories) - statuscallback('Uploading...', -1) - filename += '.' + ext + statuscallback("Uploading...", -1) + filename += "." + ext filename, wikifileurl = upload.upload( - file, filename, url, http_host, - filedesc, username, statuscallback, errorcallback + file, + filename, + url, + http_host, + filedesc, + username, + statuscallback, + errorcallback, ) if not wikifileurl: - errorcallback('Upload failed!') + errorcallback("Upload failed!") if subtitles: try: subtitleuploader.upload_subtitles( - subtitles, filename, username, - statuscallback, errorcallback + subtitles, filename, username, statuscallback, errorcallback ) except TaskAbort: raise @@ -207,7 +226,7 @@ def errorcallback(text): filename=filename, outputdir=outputdir, username=username, - statuscallback=statuscallback + statuscallback=statuscallback, ) except TaskAbort: raise @@ -218,23 +237,22 @@ def errorcallback(text): except NeedServerSideUpload as e: # json serializer cannot properly serialize an exception # without losing data, so we change the exception into a dict. - return {'type': 'ssu', 'hashsum': e.hashsum, 'url': e.url} + return {"type": "ssu", "hashsum": e.hashsum, "url": e.url} except pywikibot.exceptions.Error: exc_info = sys.exc_info() raise TaskError( - ( - 'pywikibot.Error: %s: %s' % ( - exc_info[0].__name__, exc_info[1] - ) - ).encode('utf-8')).with_traceback(exc_info[2]) + ("pywikibot.Error: %s: %s" % (exc_info[0].__name__, exc_info[1])).encode( + "utf-8" + ) + ).with_traceback(exc_info[2]) else: - statuscallback('Done!', 100) - return {'type': 'done', 'filename': filename, 'url': wikifileurl} + statuscallback("Done!", 100) + return {"type": "done", "filename": filename, "url": wikifileurl} finally: - statuscallback('Cleaning up...', -1) + statuscallback("Cleaning up...", -1) pywikibot.stopme() pywikibot.config.authenticate.clear() - pywikibot.config.usernames['commons'].clear() + pywikibot.config.usernames["commons"].clear() pywikibot._sites.clear() shutil.rmtree(outputdir) diff --git a/video2commons/config.py b/video2commons/config.py index 35af95bf..0ddae49c 100644 --- a/video2commons/config.py +++ b/video2commons/config.py @@ -11,20 +11,20 @@ tooldir = _os.path.dirname(_os.path.realpath(__file__)) if tooldir.startswith("/workspace"): # we are in buildpack tooldir = _os.path.expandvars("$TOOL_DATA_DIR/video2commons") - with open(tooldir + '/../config.json', 'r') as _f: + with open(tooldir + "/../config.json", "r") as _f: _data = _json.load(_f) except IOError as _e: - __import__('logging').exception(_e) + __import__("logging").exception(_e) _data = {} -consumer_key = _data.get('consumer_key') -consumer_secret = _data.get('consumer_secret') -api_url = _data.get('api_url') -redis_pw = _data.get('redis_pw') -redis_host = _data.get('redis_host') -session_key = _data.get('session_key') -http_host = _data.get('http_host') -webfrontend_uri = _data.get('webfrontend_uri') -socketio_uri = _data.get('socketio_uri') -youtube_user = _data.get('youtube_user') -youtube_pass = _data.get('youtube_pass') \ No newline at end of file +consumer_key = _data.get("consumer_key") +consumer_secret = _data.get("consumer_secret") +api_url = _data.get("api_url") +redis_pw = _data.get("redis_pw") +redis_host = 
_data.get("redis_host") +session_key = _data.get("session_key") +http_host = _data.get("http_host") +webfrontend_uri = _data.get("webfrontend_uri") +socketio_uri = _data.get("socketio_uri") +youtube_user = _data.get("youtube_user") +youtube_pass = _data.get("youtube_pass") diff --git a/video2commons/exceptions.py b/video2commons/exceptions.py index 2838c6cb..8491b5e0 100644 --- a/video2commons/exceptions.py +++ b/video2commons/exceptions.py @@ -49,5 +49,4 @@ class TaskAbort(TaskError): def __init__(self): """Initialize.""" - super().__init__('The task has been aborted.') - + super().__init__("The task has been aborted.") diff --git a/video2commons/frontend/__init__.py b/video2commons/frontend/__init__.py index a0c8a0b7..6eff5d71 100644 --- a/video2commons/frontend/__init__.py +++ b/video2commons/frontend/__init__.py @@ -19,8 +19,6 @@ """videocommons backend.""" - - from video2commons.frontend.app import app -__all__ = ['app'] +__all__ = ["app"] diff --git a/video2commons/frontend/api.py b/video2commons/frontend/api.py index 45943ed6..2e90f7e1 100644 --- a/video2commons/frontend/api.py +++ b/video2commons/frontend/api.py @@ -19,42 +19,44 @@ """video2commons web API.""" - - import json import traceback import re from uuid import uuid4 -from flask import ( - Blueprint, request, session, jsonify, current_app -) +from flask import Blueprint, request, session, jsonify, current_app from video2commons.config import session_key from video2commons.backend import worker from video2commons.frontend.shared import ( - redisconnection, check_banned, generate_csrf_token, redis_publish + redisconnection, + check_banned, + generate_csrf_token, + redis_publish, ) from video2commons.frontend.urlextract import ( - do_extract_url, do_validate_filename_unique, do_validate_youtube_id, make_dummy_desc, - do_validate_filename, do_validate_filedesc, sanitize -) -from video2commons.frontend.upload import ( - upload as _upload, status as _uploadstatus + do_extract_url, + do_validate_filename_unique, + do_validate_youtube_id, + make_dummy_desc, + do_validate_filename, + do_validate_filedesc, + sanitize, ) +from video2commons.frontend.upload import upload as _upload, status as _uploadstatus from video2commons.shared import stats # Adapted from: https://stackoverflow.com/a/19161373 YOUTUBE_REGEX = ( - r'(https?://)?(www\.)?' - r'(youtube|youtu|youtube-nocookie)\.(com|be)/' - r'(watch\?.*?(?=v=)v=|embed/|v/|.+\?v=)?([^&=%\?]{11})' + r"(https?://)?(www\.)?" + r"(youtube|youtu|youtube-nocookie)\.(com|be)/" + r"(watch\?.*?(?=v=)v=|embed/|v/|.+\?v=)?([^&=%\?]{11})" ) -api = Blueprint('api', __name__) +api = Blueprint("api", __name__) @api.errorhandler(Exception) @@ -66,18 +68,20 @@ def all_exception_handler(e): @api.before_request def check_logged_in(): """Error if a user is not logged in.""" - if 'username' not in session and \ - request.headers.get('X-V2C-Session-Bypass') != session_key: - return error_json('Are you logged in?') + if ( + "username" not in session + and request.headers.get("X-V2C-Session-Bypass") != session_key + ): + return error_json("Are you logged in?") @api.before_request def csrf_protect(): """For POSTs, require CSRF token.""" if request.method == "POST": - token = session.get('_csrf_token') - if not token or token != request.form.get('_csrf_token'): - return error_json('Invalid CSRF token. Try reloading this page.') + token = session.get("_csrf_token") + if not token or token != request.form.get("_csrf_token"): + return error_json("Invalid CSRF token. 
Try reloading this page.") def format_exception(e): @@ -87,7 +91,7 @@ def format_exception(e): if isinstance(e, AssertionError): return desc else: - return f'An exception occurred: {type(e).__name__}: {desc}' + return f"An exception occurred: {type(e).__name__}: {desc}" def error_json(e): @@ -95,36 +99,28 @@ def error_json(e): session.rollback() if isinstance(e, BaseException): return jsonify( - step='error', - error=format_exception(e), - traceback=traceback.format_exc() + step="error", error=format_exception(e), traceback=traceback.format_exc() ) else: - return jsonify( - step='error', - error=e, - traceback=None - ) + return jsonify(step="error", error=e, traceback=None) -@api.route('/csrf') +@api.route("/csrf") def get_csrf(): """Get the CSRF token for API-only access.""" - return jsonify( - csrf=generate_csrf_token() - ) + return jsonify(csrf=generate_csrf_token()) -@api.route('/iosession') +@api.route("/iosession") def get_iosession(): """Get a pointer to session for read-only socket.io notifications.""" iosession = str(uuid4()) - redisconnection.set('iosession:' + iosession, session.sid) - redisconnection.expire('iosession:' + iosession, 60) + redisconnection.set("iosession:" + iosession, session.sid) + redisconnection.expire("iosession:" + iosession, 60) return jsonify(iosession=iosession) -@api.route('/status') +@api.route("/status") def status(): """Get all visible task status for user.""" key, ids = get_tasks() @@ -133,21 +129,16 @@ def status(): values.append(_status(id)) values = [_f for _f in values if _f] - rooms = [t['id'] for t in values] + [key] + rooms = [t["id"] for t in values] + [key] return jsonify( - values=values, - rooms=rooms, - username=session['username'], - stats=get_stats() + values=values, rooms=rooms, username=session["username"], stats=get_stats() ) -@api.route('/status-single') +@api.route("/status-single") def status_single(): """Get the status of one task.""" - return jsonify( - value=_status(request.args['task']) - ) + return jsonify(value=_status(request.args["task"])) def _status(id): @@ -157,216 +148,211 @@ def _status(id): return None res = worker.main.AsyncResult(id) - task = { - 'id': id, - 'title': title, - 'hostname': get_hostname_from_task(id) - } + task = {"id": id, "title": title, "hostname": get_hostname_from_task(id)} try: state = res.state except: - task.update({ - 'status': 'fail', - 'text': 'The status of the task could not be retrieved.', - 'traceback': traceback.format_exc() - }) + task.update( + { + "status": "fail", + "text": "The status of the task could not be retrieved.", + "traceback": traceback.format_exc(), + } + ) else: - if state == 'PENDING': - task.update({ - 'status': 'progress', - 'text': 'Your task is pending...', - 'progress': -1 - }) - elif state == 'PROGRESS': - task.update({ - 'status': 'progress', - 'text': res.result['text'], - 'progress': res.result['percent'] - }) - elif state == 'SUCCESS': + if state == "PENDING": + task.update( + { + "status": "progress", + "text": "Your task is pending...", + "progress": -1, + } + ) + elif state == "PROGRESS": + task.update( + { + "status": "progress", + "text": res.result["text"], + "progress": res.result["percent"], + } + ) + elif state == "SUCCESS": if isinstance(res.result, (list, tuple)): filename, wikifileurl = res.result - task.update({ - 'status': 'done', - 'url': wikifileurl, - 'text': filename - }) + task.update({"status": "done", "url": wikifileurl, "text": filename}) elif isinstance(res.result, dict): - if res.result['type'] == 'done': - task.update({ - 'status': 
'done', - 'url': res.result['url'], - 'text': res.result['filename'] - }) - elif res.result['type'] == 'ssu': - task.update({ - 'status': 'needssu', - 'filename': res.result['url'].rsplit('/', 1)[-1], - 'url': res.result['url'], - 'hashsum': res.result['hashsum'] - }) - elif state == 'FAILURE': + if res.result["type"] == "done": + task.update( + { + "status": "done", + "url": res.result["url"], + "text": res.result["filename"], + } + ) + elif res.result["type"] == "ssu": + task.update( + { + "status": "needssu", + "filename": res.result["url"].rsplit("/", 1)[-1], + "url": res.result["url"], + "hashsum": res.result["hashsum"], + } + ) + elif state == "FAILURE": e = res.result if e is False: - task.update({ - 'status': 'fail', - 'text': res.traceback, - 'restartable': True - }) + task.update( + {"status": "fail", "text": res.traceback, "restartable": True} + ) else: - task.update({ - 'status': 'fail', - 'text': format_exception(e), - 'restartable': ( - (not redisconnection.exists('restarted:' + id)) and - redisconnection.exists('params:' + id) - ) - }) - elif state == 'RETRY': - task.update({ - 'status': 'progress', - 'text': 'Your task is being rescheduled...', - 'progress': -1 - }) - elif state == 'ABORTED': - task.update({ - 'status': 'abort', - 'text': 'Your task is being aborted...' - }) + task.update( + { + "status": "fail", + "text": format_exception(e), + "restartable": ( + (not redisconnection.exists("restarted:" + id)) + and redisconnection.exists("params:" + id) + ), + } + ) + elif state == "RETRY": + task.update( + { + "status": "progress", + "text": "Your task is being rescheduled...", + "progress": -1, + } + ) + elif state == "ABORTED": + task.update({"status": "abort", "text": "Your task is being aborted..."}) else: - task.update({ - 'status': 'fail', - 'text': ( - 'This task is in an unknown state. Please file an issue ' - 'in GitHub: ' - ), - 'url': 'https://github.com/toolforge/video2commons/issues' - }) + task.update( + { + "status": "fail", + "text": ( + "This task is in an unknown state. Please file an issue " + "in GitHub: " + ), + "url": "https://github.com/toolforge/video2commons/issues", + } + ) return task def is_sudoer(username): """Check if a user is a sudoer.""" - return username in redisconnection.lrange('sudoers', 0, -1) + return username in redisconnection.lrange("sudoers", 0, -1) def get_tasks(): """Get a list of visible tasks for user.""" # sudoer = able to monitor all tasks - username = session['username'] - if session.get('is_maintainer'): - key = 'alltasks' + username = session["username"] + if session.get("is_maintainer"): + key = "alltasks" else: - key = 'tasks:' + username + key = "tasks:" + username return key, redisconnection.lrange(key, 0, -1)[::-1] def get_stats(): """Get worker stats from Redis.""" - stats = redisconnection.get('stats') + stats = redisconnection.get("stats") return json.loads(stats) if stats else None def get_title_from_task(id): """Get task title from task ID.""" - return redisconnection.get('titles:' + id) + return redisconnection.get("titles:" + id) def get_hostname_from_task(id): """Get the hostname of the worker processing a task from task ID.""" - hostname = redisconnection.get('tasklock:' + id) + hostname = redisconnection.get("tasklock:" + id) # Old tasks don't have a hostname as the value in tasklock and store the # literal 'T' instead. Reinterpret these values as null. 
- if hostname == 'T': + if hostname == "T": hostname = None return hostname -@api.route('/extracturl', methods=['POST']) +@api.route("/extracturl", methods=["POST"]) def extract_url(): """Extract a video url.""" - url = request.form['url'] + url = request.form["url"] return jsonify(**do_extract_url(url)) -@api.route('/makedesc', methods=['POST']) +@api.route("/makedesc", methods=["POST"]) def make_desc(): """Create a (mostly-empty) description.""" - filename = request.form['filename'] + filename = request.form["filename"] return jsonify(**make_dummy_desc(filename)) -@api.route('/listformats', methods=['POST']) +@api.route("/listformats", methods=["POST"]) def list_formats(): """List the possible convert formats from a given audio/video pair.""" formats = [] - prefer = '' - video = _boolize(request.form['video']) - audio = _boolize(request.form['audio']) + prefer = "" + video = _boolize(request.form["video"]) + audio = _boolize(request.form["audio"]) if video: if audio: - formats = ['ogv (Theora/Vorbis)', 'webm (VP8/Vorbis)', - 'webm (VP9/Opus)', 'webm (AV1/Opus)'] - prefer = 'webm (AV1/Opus)' + formats = [ + "ogv (Theora/Vorbis)", + "webm (VP8/Vorbis)", + "webm (VP9/Opus)", + "webm (AV1/Opus)", + ] + prefer = "webm (AV1/Opus)" else: - formats = ['ogv (Theora)', 'webm (VP8)', - 'webm (VP9)', 'webm (AV1)'] - prefer = 'webm (AV1)' + formats = ["ogv (Theora)", "webm (VP8)", "webm (VP9)", "webm (AV1)"] + prefer = "webm (AV1)" else: if audio: - formats = ['ogg (Vorbis)', 'opus (Opus)'] - prefer = 'ogg (Vorbis)' + formats = ["ogg (Vorbis)", "opus (Opus)"] + prefer = "ogg (Vorbis)" else: - raise RuntimeError('Either video or audio must be kept') + raise RuntimeError("Either video or audio must be kept") - return jsonify( - audio=audio, - video=video, - format=prefer, - formats=formats - ) + return jsonify(audio=audio, video=video, format=prefer, formats=formats) def _boolize(data): - return data in [True, 'true', 'TRUE', 'True', 1, '1'] + return data in [True, "true", "TRUE", "True", 1, "1"] -@api.route('/validatefilename', methods=['POST']) +@api.route("/validatefilename", methods=["POST"]) def validate_filename(): """Validate filename for invalid characters/parts.""" - return jsonify( - filename=do_validate_filename(request.form['filename']) - ) + return jsonify(filename=do_validate_filename(request.form["filename"])) -@api.route('/validatefiledesc', methods=['POST']) +@api.route("/validatefiledesc", methods=["POST"]) def validate_filedesc(): """Validate filename for invalid characters/parts.""" - return jsonify( - filedesc=do_validate_filedesc(request.form['filedesc']) - ) + return jsonify(filedesc=do_validate_filedesc(request.form["filedesc"])) -@api.route('/validatefilenameunique', methods=['POST']) +@api.route("/validatefilenameunique", methods=["POST"]) def validate_filename_unique(): """Validate filename isn't already in use on the wiki.""" - return jsonify( - filename=do_validate_filename_unique(request.form['filename']) - ) + return jsonify(filename=do_validate_filename_unique(request.form["filename"])) -@api.route('/validateurl', methods=['POST']) +@api.route("/validateurl", methods=["POST"]) def validate_url(): """Validate that a video belonging to a URL is not already on the wiki.""" - url = request.form['url'] + url = request.form["url"] # Check if the URL is a YouTube URL, and if so, extract the ID and validate # that it doesn't already exist on Commons. 
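The check below leans on YOUTUBE_REGEX from the top of this file, whose final group captures the 11-character video ID. A small sketch of that extraction step (extract_youtube_id is a hypothetical helper; the pattern is copied verbatim from this patch):

import re

# Copied verbatim from api.py in this patch; group 6 is the video ID.
YOUTUBE_REGEX = (
    r"(https?://)?(www\.)?"
    r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
    r"(watch\?.*?(?=v=)v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
)


def extract_youtube_id(url):
    """Return the 11-character YouTube ID, or None for non-matching URLs."""
    match = re.search(YOUTUBE_REGEX, url)
    return match.group(6) if match else None


assert extract_youtube_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_youtube_id("https://example.org/video") is None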
@@ -377,8 +363,7 @@ def validate_url(): return jsonify(entity_url=do_validate_youtube_id(youtube_id)) except Exception as e: current_app.logger.error( - f'Error validating YouTube URL "{url}": {e}\n\n' - f'{traceback.format_exc()}' + f'Error validating YouTube URL "{url}": {e}\n\n{traceback.format_exc()}' ) # Skip validation if errors are encountered, e.g. SPARQL is down. @@ -392,170 +377,170 @@ def get_backend_keys(format): MAXSIZE = 5 << 30 COMBINED_FMT = ( - 'bestvideo[filesize<{max}]+' - 'bestaudio[acodec={{acodec}}]/' - 'bestvideo[filesize<{max}]+' - 'bestaudio[ext={{aext}}]/' - 'bestvideo+bestaudio/best' - ).format(max=MAXSIZE) - VIDEO_FMT = ( - 'bestvideo[filesize<{max}]/' - 'bestvideo/best' + "bestvideo[filesize<{max}]+" + "bestaudio[acodec={{acodec}}]/" + "bestvideo[filesize<{max}]+" + "bestaudio[ext={{aext}}]/" + "bestvideo+bestaudio/best" ).format(max=MAXSIZE) + VIDEO_FMT = ("bestvideo[filesize<{max}]/bestvideo/best").format(max=MAXSIZE) AUDIO_FMT = ( - 'bestaudio[acodec={{acodec}}]/' - 'bestaudio[ext={{aext}}]/' - 'bestaudio/best' - ).format(max=MAXSIZE) + "bestaudio[acodec={{acodec}}]/bestaudio[ext={{aext}}]/bestaudio/best" + ).format() return { - 'ogv (Theora)': - (VIDEO_FMT.format(vcodec='theora', vext='ogv'), 'an.ogv'), - 'webm (VP8)': - (VIDEO_FMT.format(vcodec='vp8', vext='webm'), 'an.webm'), - 'webm (VP9)': - (VIDEO_FMT.format(vcodec='vp9', vext='webm'), 'an.vp9.webm'), - 'webm (AV1)': - (VIDEO_FMT.format(vcodec='av1', vext='webm'), 'an.av1.webm'), - 'ogg (Vorbis)': - (AUDIO_FMT.format(acodec='vorbis', aext='ogg'), 'ogg'), - 'opus (Opus)': - (AUDIO_FMT.format(acodec='opus', aext='opus'), 'opus'), - 'ogv (Theora/Vorbis)': - (COMBINED_FMT.format( - vcodec='theora', vext='ogv', acodec='vorbis', aext='ogg'), - 'ogv'), - 'webm (VP8/Vorbis)': - (COMBINED_FMT.format( - vcodec='vp8', vext='webm', acodec='vorbis', aext='ogg'), - 'webm'), - 'webm (VP9/Opus)': - (COMBINED_FMT.format( - vcodec='vp9', vext='webm', acodec='opus', aext='webm'), - 'vp9.webm'), - 'webm (AV1/Opus)': - (COMBINED_FMT.format( - vcodec='av1', vext='webm', acodec='opus', aext='webm'), - 'av1.webm'), + "ogv (Theora)": (VIDEO_FMT.format(vcodec="theora", vext="ogv"), "an.ogv"), + "webm (VP8)": (VIDEO_FMT.format(vcodec="vp8", vext="webm"), "an.webm"), + "webm (VP9)": (VIDEO_FMT.format(vcodec="vp9", vext="webm"), "an.vp9.webm"), + "webm (AV1)": (VIDEO_FMT.format(vcodec="av1", vext="webm"), "an.av1.webm"), + "ogg (Vorbis)": (AUDIO_FMT.format(acodec="vorbis", aext="ogg"), "ogg"), + "opus (Opus)": (AUDIO_FMT.format(acodec="opus", aext="opus"), "opus"), + "ogv (Theora/Vorbis)": ( + COMBINED_FMT.format( + vcodec="theora", vext="ogv", acodec="vorbis", aext="ogg" + ), + "ogv", + ), + "webm (VP8/Vorbis)": ( + COMBINED_FMT.format(vcodec="vp8", vext="webm", acodec="vorbis", aext="ogg"), + "webm", + ), + "webm (VP9/Opus)": ( + COMBINED_FMT.format(vcodec="vp9", vext="webm", acodec="opus", aext="webm"), + "vp9.webm", + ), + "webm (AV1/Opus)": ( + COMBINED_FMT.format(vcodec="av1", vext="webm", acodec="opus", aext="webm"), + "av1.webm", + ), }[format] -@api.route('/task/run', methods=['POST']) +@api.route("/task/run", methods=["POST"]) def run_task(): """Run a task with parameters from session.""" - url = request.form['url'] - ie_key = request.form['extractor'] - subtitles = request.form['subtitles'] - filename = sanitize(request.form['filename']) - filedesc = request.form['filedesc'] - downloadkey, convertkey = get_backend_keys(request.form['format']) - username = session['username'] - oauth = (session['access_token_key'], 
session['access_token_secret']) - - taskid = run_task_internal(filename, ( - url, ie_key, subtitles, filename, filedesc, - downloadkey, convertkey, username, oauth - )) + url = request.form["url"] + ie_key = request.form["extractor"] + subtitles = request.form["subtitles"] + filename = sanitize(request.form["filename"]) + filedesc = request.form["filedesc"] + downloadkey, convertkey = get_backend_keys(request.form["format"]) + username = session["username"] + oauth = (session["access_token_key"], session["access_token_secret"]) + + taskid = run_task_internal( + filename, + ( + url, + ie_key, + subtitles, + filename, + filedesc, + downloadkey, + convertkey, + username, + oauth, + ), + ) - return jsonify(id=taskid, step='success') + return jsonify(id=taskid, step="success") def run_task_internal(filename, params): """Internal run task function to accept whatever params given.""" banned = check_banned() - assert not banned, 'You are banned from using this tool! Reason: ' + banned + assert not banned, "You are banned from using this tool! Reason: " + banned res = worker.main.delay(*params) taskid = res.id expire = 14 * 24 * 3600 # 2 weeks - redisconnection.lpush('alltasks', taskid) - redisconnection.expire('alltasks', expire) - redisconnection.lpush('tasks:' + session['username'], taskid) - redisconnection.expire('tasks:' + session['username'], expire) - redisconnection.set('titles:' + taskid, filename) - redisconnection.expire('titles:' + taskid, expire) - redisconnection.set('params:' + taskid, json.dumps(params)) - redisconnection.expire('params:' + taskid, expire) + redisconnection.lpush("alltasks", taskid) + redisconnection.expire("alltasks", expire) + redisconnection.lpush("tasks:" + session["username"], taskid) + redisconnection.expire("tasks:" + session["username"], expire) + redisconnection.set("titles:" + taskid, filename) + redisconnection.expire("titles:" + taskid, expire) + redisconnection.set("params:" + taskid, json.dumps(params)) + redisconnection.expire("params:" + taskid, expire) try: stats.increment_queue_counter(redisconnection) except Exception: pass # We don't want to fail the API call if we can't update stats. - redis_publish('add', {'taskid': taskid, 'user': session['username']}) - redis_publish('update', {'taskid': taskid, 'data': _status(taskid)}) + redis_publish("add", {"taskid": taskid, "user": session["username"]}) + redis_publish("update", {"taskid": taskid, "data": _status(taskid)}) return taskid -@api.route('/task/restart', methods=['POST']) +@api.route("/task/restart", methods=["POST"]) def restart_task(): """Reastart a task: run a task with params of another task.""" - id = request.form['id'] + id = request.form["id"] - filename = redisconnection.get('titles:' + id) - assert filename, 'Task does not exist' - if not session.get('is_maintainer'): - assert id in \ - redisconnection.lrange('tasks:' + session['username'], 0, -1), \ - 'Task must belong to you.' + filename = redisconnection.get("titles:" + id) + assert filename, "Task does not exist" + if not session.get("is_maintainer"): + assert id in redisconnection.lrange("tasks:" + session["username"], 0, -1), ( + "Task must belong to you." + ) - restarted = redisconnection.get('restarted:' + id) - assert not restarted, \ - 'Task has already been restarted with id ' + restarted - params = redisconnection.get('params:' + id) - assert params, 'Could not extract the task parameters.' 
+ restarted = redisconnection.get("restarted:" + id) + assert not restarted, "Task has already been restarted with id " + restarted + params = redisconnection.get("params:" + id) + assert params, "Could not extract the task parameters." newid = run_task_internal(filename, json.loads(params)) - redisconnection.set('restarted:' + id, newid) + redisconnection.set("restarted:" + id, newid) - redis_publish('update', {'taskid': id, 'data': _status(id)}) + redis_publish("update", {"taskid": id, "data": _status(id)}) - return jsonify(restart='success', id=id, taskid=newid) + return jsonify(restart="success", id=id, taskid=newid) -@api.route('/task/remove', methods=['POST']) +@api.route("/task/remove", methods=["POST"]) def remove_task(): """Revove a task from list of tasks.""" - id = request.form['id'] - username = session['username'] - if not session.get('is_maintainer'): - assert id in \ - redisconnection.lrange('tasks:' + username, 0, -1), \ - 'Task must belong to you.' - redisconnection.lrem('alltasks', 0, id) - redisconnection.lrem('tasks:' + username, 0, id) - redisconnection.delete('titles:' + id) - redisconnection.delete('params:' + id) - redisconnection.delete('restarted:' + id) + id = request.form["id"] + username = session["username"] + if not session.get("is_maintainer"): + assert id in redisconnection.lrange("tasks:" + username, 0, -1), ( + "Task must belong to you." + ) + redisconnection.lrem("alltasks", 0, id) + redisconnection.lrem("tasks:" + username, 0, id) + redisconnection.delete("titles:" + id) + redisconnection.delete("params:" + id) + redisconnection.delete("restarted:" + id) - redis_publish('remove', {'taskid': id}) + redis_publish("remove", {"taskid": id}) - return jsonify(remove='success', id=id) + return jsonify(remove="success", id=id) -@api.route('/task/abort', methods=['POST']) +@api.route("/task/abort", methods=["POST"]) def abort_task(): """Abort a task.""" - id = request.form['id'] - username = session['username'] - if not session.get('is_maintainer'): - assert id in \ - redisconnection.lrange('tasks:' + username, 0, -1), \ - 'Task must belong to you.' + id = request.form["id"] + username = session["username"] + if not session.get("is_maintainer"): + assert id in redisconnection.lrange("tasks:" + username, 0, -1), ( + "Task must belong to you." 
+ ) worker.main.AsyncResult(id).abort() - redis_publish('update', {'taskid': id, 'data': _status(id)}) + redis_publish("update", {"taskid": id, "data": _status(id)}) - return jsonify(remove='success', id=id) + return jsonify(remove="success", id=id) # No nested blueprints in flask; we have to do this :( -@api.route('/upload/upload', methods=['POST']) +@api.route("/upload/upload", methods=["POST"]) def upload(): return _upload() -@api.route('/upload/status', methods=['POST']) +@api.route("/upload/status", methods=["POST"]) def uploadstatus(): return _uploadstatus() diff --git a/video2commons/frontend/app.py b/video2commons/frontend/app.py index e32ef382..29866850 100644 --- a/video2commons/frontend/app.py +++ b/video2commons/frontend/app.py @@ -19,32 +19,35 @@ """video2commons web frontend.""" - - import json import logging import traceback from urllib.parse import urlparse, urljoin -from flask import ( - Flask, request, Response, session, render_template, redirect, url_for -) +from flask import Flask, request, Response, session, render_template, redirect, url_for from mwoauth import AccessToken, ConsumerToken, RequestToken, Handshaker from requests_oauthlib import OAuth1 import requests from video2commons.config import ( - consumer_key, consumer_secret, api_url, webfrontend_uri, socketio_uri + consumer_key, + consumer_secret, + api_url, + webfrontend_uri, + socketio_uri, ) from video2commons.frontend.redisession import RedisSessionInterface from video2commons.frontend.shared import redisconnection, check_banned from video2commons.frontend.api import api, is_sudoer from video2commons.frontend.i18n import ( - i18nblueprint, translate as _, getlanguage, is_rtl + i18nblueprint, + translate as _, + getlanguage, + is_rtl, ) -ISSUE_URL = 'https://github.com/toolforge/video2commons/issues' +ISSUE_URL = "https://github.com/toolforge/video2commons/issues" consumer_token = ConsumerToken(consumer_key, consumer_secret) handshaker = Handshaker(api_url, consumer_token) @@ -53,23 +56,23 @@ app.logger.setLevel(logging.INFO) -app.session_cookie_name = 'v2c-session' +app.session_cookie_name = "v2c-session" app.session_interface = RedisSessionInterface(redisconnection) -app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 3600 +app.config["SEND_FILE_MAX_AGE_DEFAULT"] = 3600 config_p = { - 'webfrontend_uri': webfrontend_uri, - 'socketio_uri': socketio_uri, + "webfrontend_uri": webfrontend_uri, + "socketio_uri": socketio_uri, } -app.jinja_env.globals['config'] = config_p -app.jinja_env.globals['_'] = _ -app.jinja_env.globals['lang'] = getlanguage -app.jinja_env.tests['rtl'] = is_rtl +app.jinja_env.globals["config"] = config_p +app.jinja_env.globals["_"] = _ +app.jinja_env.globals["lang"] = getlanguage +app.jinja_env.tests["rtl"] = is_rtl -app.register_blueprint(api, url_prefix='/api') -app.register_blueprint(i18nblueprint, url_prefix='/i18n') +app.register_blueprint(api, url_prefix="/api") +app.register_blueprint(i18nblueprint, url_prefix="/i18n") @app.errorhandler(Exception) @@ -80,25 +83,24 @@ def all_exception_handler(e): try: message = ( - f'Please file an issue with this error in GitHub: ' - f'{issue_link}
<br><br>' + f"Please file an issue with this error in GitHub: {issue_link}<br><br>
" ) - loggedin = 'username' in session + loggedin = "username" in session stacktrace = traceback.format_exc() except: message = ( - f'Something went terribly wrong, ' - f'and we failed to find the cause automatically. ' - f'Please file an issue in GitHub: {issue_link}' + f"Something went terribly wrong, " + f"and we failed to find the cause automatically. " + f"Please file an issue in GitHub: {issue_link}" ) loggedin = False try: return render_template( - 'error.min.html', + "error.min.html", html_message=message, stacktrace=stacktrace, - loggedin=loggedin + loggedin=loggedin, ), 500 except: return message, 500 @@ -107,65 +109,56 @@ def all_exception_handler(e): @app.before_request def force_https(): """Force user to redirect to https, checking X-Forwarded-Proto.""" - if request.headers.get('X-Forwarded-Proto') == 'http': - return redirect('https://' + request.headers['Host'] + - request.headers['X-Original-URI'], - code=301) + if request.headers.get("X-Forwarded-Proto") == "http": + return redirect( + "https://" + request.headers["Host"] + request.headers["X-Original-URI"], + code=301, + ) -@app.route('/config') +@app.route("/config") def get_config(): """Get the current config as a dict and output Javascript.""" - data = 'window.config=' + json.dumps(config_p) + ';' - return Response(data, mimetype='application/javascript; charset=utf-8') + data = "window.config=" + json.dumps(config_p) + ";" + return Response(data, mimetype="application/javascript; charset=utf-8") -@app.route('/') +@app.route("/") def main(): """Main page.""" banned = check_banned() if banned: return render_template( - 'error.min.html', - message='You are banned from using this tool! Reason: ' + banned, - loggedin=False + "error.min.html", + message="You are banned from using this tool! Reason: " + banned, + loggedin=False, ) try: auth = dologin() - session['language'] = querylanguage(auth) + session["language"] = querylanguage(auth) except: # SECURITY: If we cannot login, the session is invalid. 
app.session_interface.abandon_session(app, session) - return render_template( - 'main.min.html', - loggedin=False - ) + return render_template("main.min.html", loggedin=False) - return render_template( - 'main.min.html', - loggedin=True - ) + return render_template("main.min.html", loggedin=True) def dologin(): """Attempt to login.""" - if not ( - 'access_token_key' in session and - 'access_token_secret' in session - ): + if not ("access_token_key" in session and "access_token_secret" in session): raise NameError("No access keys") access_token = AccessToken( - session['access_token_key'], - session['access_token_secret'] + session["access_token_key"], session["access_token_secret"] ) - session['username'] = handshaker.identify(access_token)['username'] + session["username"] = handshaker.identify(access_token)["username"] auth = OAuth1( client_key=consumer_token.key, client_secret=consumer_token.secret, resource_owner_key=access_token.key, - resource_owner_secret=access_token.secret + resource_owner_secret=access_token.secret, ) return auth @@ -173,21 +166,21 @@ def dologin(): def querylanguage(auth): """Query user's language that's available on v2c.""" - default = 'en' + default = "en" r = requests.post( - url=api_url.replace('index.php', 'api.php'), + url=api_url.replace("index.php", "api.php"), data={ - 'action': 'query', - 'format': 'json', - 'meta': 'userinfo', - 'uiprop': 'options' + "action": "query", + "format": "json", + "meta": "userinfo", + "uiprop": "options", }, - auth=auth + auth=auth, ) try: - language = r.json()['query']['userinfo']['options']['language'] + language = r.json()["query"]["userinfo"]["options"]["language"] except (NameError, KeyError): return default @@ -197,71 +190,74 @@ def querylanguage(auth): return language -@app.route('/oauthinit') +@app.route("/oauthinit") def loginredirect(): """Initialize OAuth login.""" app.session_interface.abandon_session(app, session) redirecturl, request_token = handshaker.initiate() - session['request_token_key'], session['request_token_secret'] = \ - request_token.key, request_token.secret - session['return_to_url'] = url_for('main') + session["request_token_key"], session["request_token_secret"] = ( + request_token.key, + request_token.secret, + ) + session["return_to_url"] = url_for("main") - returnto = request.args.get('returnto') + returnto = request.args.get("returnto") if returnto: ref_url = urlparse(request.url_root) test_url = urlparse(urljoin(request.host_url, returnto)) if ( - test_url.scheme == ref_url.scheme and - test_url.netloc == ref_url.netloc and - test_url.path.startswith(ref_url.path) + test_url.scheme == ref_url.scheme + and test_url.netloc == ref_url.netloc + and test_url.path.startswith(ref_url.path) ): - session['return_to_url'] = returnto + session["return_to_url"] = returnto return redirect(redirecturl) -@app.route('/oauthcallback') +@app.route("/oauthcallback") def logincallback(): """Finialize OAuth login.""" request_token = RequestToken( - session['request_token_key'], - session['request_token_secret'] + session["request_token_key"], session["request_token_secret"] ) access_token = handshaker.complete(request_token, request.query_string) - session.pop('access_token_key', None) - session.pop('access_token_secret', None) - session.pop('username', None) + session.pop("access_token_key", None) + session.pop("access_token_secret", None) + session.pop("username", None) identify = handshaker.identify(access_token) - is_contributor = identify['editcount'] >= 50 - is_maintainer = 
is_sudoer(identify['username']) - is_autoconfirmed = 'autoconfirmed' in identify['rights'] + is_contributor = identify["editcount"] >= 50 + is_maintainer = is_sudoer(identify["username"]) + is_autoconfirmed = "autoconfirmed" in identify["rights"] # Only allow autoconfirmed users either with at least 50 edits or # maintainer status to use this tool. if not (is_autoconfirmed and (is_contributor or is_maintainer)): return render_template( - 'error.min.html', - message='You must be an autoconfirmed Commons user ' - 'with at least 50 edits to use this tool.', - loggedin=True + "error.min.html", + message="You must be an autoconfirmed Commons user " + "with at least 50 edits to use this tool.", + loggedin=True, ) - session['access_token_key'], session['access_token_secret'] = \ - access_token.key, access_token.secret + session["access_token_key"], session["access_token_secret"] = ( + access_token.key, + access_token.secret, + ) - session['username'] = identify['username'] - session['is_maintainer'] = is_maintainer + session["username"] = identify["username"] + session["is_maintainer"] = is_maintainer - return redirect(session.get('return_to_url', url_for('main'))) + return redirect(session.get("return_to_url", url_for("main"))) -@app.route('/logout') +@app.route("/logout") def logout(): """Logout: clear all session data.""" session.clear() - return redirect(url_for('main')) + return redirect(url_for("main")) diff --git a/video2commons/frontend/i18n.py b/video2commons/frontend/i18n.py index 52b96cb8..8d5d7336 100644 --- a/video2commons/frontend/i18n.py +++ b/video2commons/frontend/i18n.py @@ -19,15 +19,13 @@ """video2commons web i18n module.""" - - import os import json from flask import Blueprint, Response, request, session, g from video2commons.frontend.shared import redisconnection -i18nblueprint = Blueprint('i18n', __name__) +i18nblueprint = Blueprint("i18n", __name__) _d = os.path.dirname(os.path.realpath(__file__)) @@ -39,16 +37,16 @@ def max_age(response): return response -@i18nblueprint.route('/<lang>') +@i18nblueprint.route("/<lang>") def urlget(lang): """Get the i18n of language lang and output Javascript.""" - data = 'window.i18n=' + json.dumps(get(lang)) + ';' - return Response(data, mimetype='application/javascript; charset=utf-8') + data = "window.i18n=" + json.dumps(get(lang)) + ";" + return Response(data, mimetype="application/javascript; charset=utf-8") def get(lang): """Get the i18n of language lang and output dict.""" - i18nkey = 'i18n:' + lang + i18nkey = "i18n:" + lang gval = g.get(i18nkey, None) if gval: return gval @@ -58,8 +56,8 @@ def get(lang): data = {} fallbacklist = _create_fallback(lang) datafiles = _loadi18nfiles(fallbacklist) - for key in datafiles['en']: - if key == '@metadata': + for key in datafiles["en"]: + if key == "@metadata": # @metadata is a dict not a string continue @@ -70,11 +68,11 @@ def get(lang): # if the translation breaks due to double escaping, # oh well, why are you hacking this tool?
            # --XSS prevention
-            data[key] = data[key].replace('<', '&lt;')
-            data[key] = data[key].replace('>', '&gt;')
+            data[key] = data[key].replace("<", "&lt;")
+            data[key] = data[key].replace(">", "&gt;")
             break
 
-    data['@lang'] = lang
-    data['@dir'] = _dir(lang)
+    data["@lang"] = lang
+    data["@dir"] = _dir(lang)
 
     setattr(g, i18nkey, data)
     redisconnection.setex(i18nkey, 60, json.dumps(data))
@@ -85,40 +83,40 @@ def _loadi18nfiles(fallbacklist):
     datafiles = {}
     for code in fallbacklist:
         if code not in datafiles:
-            path = _d + '/i18n/' + code + '.json'
+            path = _d + "/i18n/" + code + ".json"
             if os.path.isfile(path):
-                with open(path, 'r') as f:
+                with open(path, "r") as f:
                     datafiles[code] = json.load(f)
 
     return datafiles
 
 
 def _create_fallback(lang):
-    fallbacks = _loadmetadatafile('fallbacks').get(lang, [])
+    fallbacks = _loadmetadatafile("fallbacks").get(lang, [])
     fallbacks = fallbacks if isinstance(fallbacks, list) else [fallbacks]
-    return [lang] + fallbacks + ['en']
+    return [lang] + fallbacks + ["en"]
 
 
 def translate(key):
     """Translate a key in user language."""
-    return get(getlanguage()).get(key, '<' + key + '>')
+    return get(getlanguage()).get(key, "<" + key + ">")
 
 
 def getlanguage():
     """Get the user language."""
-    gval = g.get('language', None)
+    gval = g.get("language", None)
     if gval:
         return gval
 
     for lang in [
-        request.form.get('uselang'),
-        request.args.get('uselang'),
-        session.get('language'),
+        request.form.get("uselang"),
+        request.args.get("uselang"),
+        session.get("language"),
         request.accept_languages.best,
     ]:
         if lang and _islang(lang):
             break
     else:
-        lang = 'en'
+        lang = "en"
 
     g.language = lang
 
@@ -126,13 +124,13 @@ def getlanguage():
 
 
 def _loadmetadatafile(metadata):
-    key = 'i18nmeta-' + metadata
+    key = "i18nmeta-" + metadata
     gval = g.get(key, None)
     if gval:
         return gval
 
-    path = _d + '/i18n-metadata/' + metadata + '.json'
-    with open(path, 'r') as f:
+    path = _d + "/i18n-metadata/" + metadata + ".json"
+    with open(path, "r") as f:
         data = json.load(f)
 
     setattr(g, key, data)
 
@@ -140,13 +138,13 @@ def _islang(lang):
-    return lang in _loadmetadatafile('alllangs')
+    return lang in _loadmetadatafile("alllangs")
 
 
 def _dir(lang):
-    return 'rtl' if lang in _loadmetadatafile('rtl') else 'ltr'
+    return "rtl" if lang in _loadmetadatafile("rtl") else "ltr"
 
 
 def is_rtl(lang):
     """Jinja2 test for rtl-ness."""
-    return get(lang).get('@dir') == 'rtl'
+    return get(lang).get("@dir") == "rtl"
diff --git a/video2commons/frontend/redisession.py b/video2commons/frontend/redisession.py
index 4ab899b1..7fdbda89 100644
--- a/video2commons/frontend/redisession.py
+++ b/video2commons/frontend/redisession.py
@@ -44,7 +44,7 @@ class RedisSessionInterface(SessionInterface):
     serializer = json
     session_class = RedisSession
 
-    def __init__(self, redis=None, prefix='session:'):
+    def __init__(self, redis=None, prefix="session:"):
         """Initialize the instance."""
         if redis is None:
             redis = Redis()
@@ -83,28 +83,37 @@ def open_session(self, app, request):
     def save_session(self, app, session, response):
         """Save session to Redis."""
         domain = self.get_cookie_domain(app)
-        path = url_for('main', _external=False)
+        path = url_for("main", _external=False)
         if session is None:
             return
         elif not session:
             self.redis.delete(self.prefix + session.sid)
             if session.modified:
-                response.delete_cookie(app.session_cookie_name,
-                                       domain=domain, path=path)
+                response.delete_cookie(
+                    app.session_cookie_name, domain=domain, path=path
+                )
         else:
             redis_exp = self.get_redis_expiration_time(app, session)
             cookie_exp = 
self.get_expiration_time(app, session) if session.modified: val = self.serializer.dumps(dict(session)) - self.redis.setex(self.prefix + session.sid, - int(redis_exp.total_seconds()), val) + self.redis.setex( + self.prefix + session.sid, int(redis_exp.total_seconds()), val + ) else: - self.redis.expire(self.prefix + session.sid, - int(redis_exp.total_seconds())) - response.set_cookie(app.session_cookie_name, session.sid, - expires=cookie_exp, httponly=True, - domain=domain, path=path, secure=True) + self.redis.expire( + self.prefix + session.sid, int(redis_exp.total_seconds()) + ) + response.set_cookie( + app.session_cookie_name, + session.sid, + expires=cookie_exp, + httponly=True, + domain=domain, + path=path, + secure=True, + ) def abandon_session(self, app, session): """Delete the session from redis, empty it, and reinit.""" diff --git a/video2commons/frontend/shared.py b/video2commons/frontend/shared.py index 4400db54..3cfe5fd3 100644 --- a/video2commons/frontend/shared.py +++ b/video2commons/frontend/shared.py @@ -19,8 +19,6 @@ """video2commons web shared.""" - - import json from uuid import uuid4 @@ -29,8 +27,7 @@ from video2commons.config import redis_pw, redis_host -redisconnection = Redis(host=redis_host, db=3, password=redis_pw, - decode_responses=True) +redisconnection = Redis(host=redis_host, db=3, password=redis_pw, decode_responses=True) def check_banned(): @@ -40,10 +37,10 @@ def check_banned(): def generate_csrf_token(): """Generate a CSRF token.""" - if '_csrf_token' not in session: - session['_csrf_token'] = str(uuid4()) - return session['_csrf_token'] + if "_csrf_token" not in session: + session["_csrf_token"] = str(uuid4()) + return session["_csrf_token"] def redis_publish(typ, data): - redisconnection.publish('v2cnotif:'+typ, json.dumps(data)) + redisconnection.publish("v2cnotif:" + typ, json.dumps(data)) diff --git a/video2commons/frontend/upload.py b/video2commons/frontend/upload.py index a51ae8c2..9bb92f2e 100644 --- a/video2commons/frontend/upload.py +++ b/video2commons/frontend/upload.py @@ -18,7 +18,6 @@ # - import os import re import uuid @@ -26,8 +25,8 @@ from flask import request, jsonify -RE_CONTENT_RANGE = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$') -RE_ALLOWED_FILEKEYS = re.compile(r'^[a-zA-Z0-9-]+$') +RE_CONTENT_RANGE = re.compile(r"^bytes (\d+)-(\d+)/(\d+)$") +RE_ALLOWED_FILEKEYS = re.compile(r"^[a-zA-Z0-9-]+$") class WrongOffset(Exception): @@ -37,8 +36,9 @@ def __init__(self, offset): def getpath(digest): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), - 'static/uploads', digest) + return os.path.join( + os.path.dirname(os.path.realpath(__file__)), "static/uploads", digest + ) def stat(permpath): @@ -47,42 +47,43 @@ def stat(permpath): # Flask endpoint def upload(): - f = request.files['file'] + f = request.files["file"] assert f, "Where's my file?" 
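     # The client-supplied file key doubles as an on-disk filename under
     # static/uploads, so it is restricted to letters, digits, and hyphens
     # before any path is built from it.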
-    filekey = request.form.get('filekey') or str(uuid.uuid1())
-    assert RE_ALLOWED_FILEKEYS.match('filekey'), 'Unacceptable file key'
+    filekey = request.form.get("filekey") or str(uuid.uuid1())
+    assert RE_ALLOWED_FILEKEYS.match(filekey), "Unacceptable file key"
     permpath = getpath(filekey)
 
-    content_range = (f.headers.get('Content-Range') or
-                     request.headers.get('Content-Range'))
+    content_range = f.headers.get("Content-Range") or request.headers.get(
+        "Content-Range"
+    )
 
     if content_range:
         result, kwargs = handle_chunked(f, permpath, content_range)
     else:
         result, kwargs = handle_full(f, permpath)
 
-    kwargs['filekey'] = filekey
+    kwargs["filekey"] = filekey
 
     return jsonify(result=result, **kwargs)
 
 
 # Flask endpoint
 def status():
-    permpath = getpath(request.form['filekey'])
+    permpath = getpath(request.form["filekey"])
     return jsonify(offset=stat(permpath))
 
 
 def handle_full(f, permpath):
     f.save(permpath)
-    return 'Success', {}
+    return "Success", {}
 
 
 def handle_chunked(f, permpath, content_range):
     try:
         content_range = RE_CONTENT_RANGE.match(content_range)
-        assert content_range, 'Invalid content range!'
+        assert content_range, "Invalid content range!"
 
         cr1, cr2, cr3 = [int(content_range.group(i)) for i in range(1, 4)]
 
@@ -94,7 +95,7 @@ def handle_chunked(f, permpath, content_range):
         if size != cr1:
             raise WrongOffset(size)
 
-        with open(permpath, 'ab') as dest:
+        with open(permpath, "ab") as dest:
             shutil.copyfileobj(f, dest)
 
     except WrongOffset as e:
@@ -102,8 +103,9 @@ def handle_chunked(f, permpath, content_range):
     else:
         size = stat(permpath)
         if size < cr3:
-            return 'Continue', {'offset': size}
+            return "Continue", {"offset": size}
         elif size > cr3:
-            raise RuntimeError('What?! Uploaded file is larger than '
-                               'what it is supposed to be?')
-    return 'Success', {}
+            raise RuntimeError(
+                "What?! Uploaded file is larger than what it is supposed to be?"
+ ) + return "Success", {} diff --git a/video2commons/frontend/urlextract.py b/video2commons/frontend/urlextract.py index 1251eeaf..e32a32a7 100644 --- a/video2commons/frontend/urlextract.py +++ b/video2commons/frontend/urlextract.py @@ -21,10 +21,7 @@ from collections import OrderedDict from video2commons.backend.encode.transcode import WebVideoTranscode -from video2commons.config import ( - tooldir, youtube_user, youtube_pass, consumer_key, consumer_secret -) -from pywikibot.data import sparql +from video2commons.config import tooldir, youtube_user, youtube_pass import re import emoji @@ -37,9 +34,9 @@ SITE = pywikibot.Site() # File extensions are probably alphanumeric with 0 to 4 chars -RE_EXTENSION = re.compile(r'^[a-z0-9]{0,4}$', re.IGNORECASE) +RE_EXTENSION = re.compile(r"^[a-z0-9]{0,4}$", re.IGNORECASE) -DEFAULT_LICENSE = '{{subst:nld|}}' +DEFAULT_LICENSE = "{{subst:nld|}}" FILEDESC_TEMPLATE = """ =={{int:filedesc}}== {{Information @@ -71,212 +68,214 @@ def make_dummy_desc(filename): filedesc = FILEDESC_TEMPLATE % { - 'desc': '', - 'date': '', - 'source': '', - 'uploader': '', - 'license': DEFAULT_LICENSE + "desc": "", + "date": "", + "source": "", + "uploader": "", + "license": DEFAULT_LICENSE, } # Remove the extension - filename = filename.rsplit('.', 1) + filename = filename.rsplit(".", 1) if len(filename) == 1 or RE_EXTENSION.match(filename[1]): filename = filename[0] else: - filename = '.'.join(filename) + filename = ".".join(filename) return { - 'extractor': '(uploads)', - 'filedesc': filedesc.strip(), - 'filename': sanitize(filename) + "extractor": "(uploads)", + "filedesc": filedesc.strip(), + "filename": sanitize(filename), } def do_extract_url(url): """Extract a video url.""" params = { - 'format': 'bestvideo+bestaudio/best', - 'outtmpl': '/dev/null', - 'writedescription': True, - 'writeinfojson': True, - 'writesubtitles': False, - 'subtitlesformat': 'srt/ass/vtt/best', - 'cachedir': '/tmp/', - 'noplaylist': False, + "format": "bestvideo+bestaudio/best", + "outtmpl": "/dev/null", + "writedescription": True, + "writeinfojson": True, + "writesubtitles": False, + "subtitlesformat": "srt/ass/vtt/best", + "cachedir": "/tmp/", + "noplaylist": False, } - if '.youtube.com/' in url: + if ".youtube.com/" in url: # https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies # https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp - params.update({ - 'cookiefile': tooldir + '/../cookies.txt', - 'username': youtube_user, - 'password': youtube_pass - }) + params.update( + { + "cookiefile": tooldir + "/../cookies.txt", + "username": youtube_user, + "password": youtube_pass, + } + ) with yt_dlp.YoutubeDL(params) as dl: info = dl.extract_info(url, download=False) # Extract playlist entries if this is a playlist. 
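    # yt-dlp signals a playlist by including an "entries" list in the
    # metadata; each entry carries the same shape as a single video, so
    # both cases are processed by _extract_info().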
-    if info and 'entries' in info:
+    if info and "entries" in info:
         videos = []
-        for entry in info['entries']:
+        for entry in info["entries"]:
             video_info = _extract_info(entry)
             videos.append(video_info)
 
         return {
-            'type': 'playlist',
-            'id': info.get('id', ''),
-            'title': info.get('title', ''),
-            'url': url,
-            'videos': videos
+            "type": "playlist",
+            "id": info.get("id", ""),
+            "title": info.get("title", ""),
+            "url": url,
+            "videos": videos,
         }
 
     video_info = _extract_info(info)
-    return { 'type': 'single', **video_info }
+    return {"type": "single", **video_info}
 
 
 def _extract_info(info):
     """Process metadata for a single video."""
-    assert 'formats' in info or info.get('direct'), \
-        'Your url cannot be processed correctly'
+    assert "formats" in info or info.get("direct"), (
+        "Your url cannot be processed correctly"
+    )
 
-    ie_key = info['extractor_key']
-    title = (info.get('title') or '').strip()
-    url = info.get('webpage_url')
+    ie_key = info["extractor_key"]
+    title = (info.get("title") or "").strip()
+    url = info.get("webpage_url")
 
     filedesc = FILEDESC_TEMPLATE % {
-        'desc': _desc(url, ie_key, title, info),
-        'date': _date(url, ie_key, title, info),
-        'source': _source(url, ie_key, title, info),
-        'uploader': _uploader(url, ie_key, title, info),
-        'license': _license(url, ie_key, title, info)
+        "desc": _desc(url, ie_key, title, info),
+        "date": _date(url, ie_key, title, info),
+        "source": _source(url, ie_key, title, info),
+        "uploader": _uploader(url, ie_key, title, info),
+        "license": _license(url, ie_key, title, info),
     }
 
     return {
-        'url': url,
-        'extractor': ie_key,
-        'filedesc': filedesc.strip(),
-        'filename': sanitize(title),
-        'date': _date(url, ie_key, title, info)
+        "url": url,
+        "extractor": ie_key,
+        "filedesc": filedesc.strip(),
+        "filename": sanitize(title),
+        "date": _date(url, ie_key, title, info),
     }
 
 
 def _date(url, ie_key, title, info):
-    date = (info.get('upload_date') or '').strip()
-    if re.match(r'^[0-9]{8}$', date):
-        date = '%s-%s-%s' % (date[0:4], date[4:6], date[6:8])
+    date = (info.get("upload_date") or "").strip()
+    if re.match(r"^[0-9]{8}$", date):
+        date = "%s-%s-%s" % (date[0:4], date[4:6], date[6:8])
     return date
 
 
 def _source(url, ie_key, title, info):
-    if info['id']:
-        if ie_key == 'Youtube':
-            return '{{From YouTube|1=%(id)s|2=%(title)s}}' % \
-                {'id': info['id'], 'title': escape_wikitext(title)}
-        elif ie_key == 'Vimeo':
-            return '{{From Vimeo|1=%(id)s|2=%(title)s}}' % \
-                {'id': info['id'], 'title': escape_wikitext(title)}
-
-    if ie_key == 'Generic':
+    if info["id"]:
+        if ie_key == "Youtube":
+            return "{{From YouTube|1=%(id)s|2=%(title)s}}" % {
+                "id": info["id"],
+                "title": escape_wikitext(title),
+            }
+        elif ie_key == "Vimeo":
+            return "{{From Vimeo|1=%(id)s|2=%(title)s}}" % {
+                "id": info["id"],
+                "title": escape_wikitext(title),
+            }
+
+    if ie_key == "Generic":
         return url
     else:
-        if ':' in info['extractor']:
+        if ":" in info["extractor"]:
             # Try to find the name of the 'owner' of this sub-ie
-            ie_tmp = info['extractor'][:info['extractor'].index(':')]
+            ie_tmp = info["extractor"][: info["extractor"].index(":")]
             for ie in yt_dlp.gen_extractors():
                 if ie.IE_NAME == ie_tmp:
                     ie_key = ie.ie_key()
                     break
-    return '[%(url)s %(title)s - %(extractor)s]' % \
-        {'url': url, 'title': escape_wikitext(title), 'extractor': ie_key}
+    return "[%(url)s %(title)s - %(extractor)s]" % {
+        "url": url,
+        "title": escape_wikitext(title),
+        "extractor": ie_key,
+    }
 
 
 def _desc(url, ie_key, title, info):
-    desc_orig = desc 
= (info.get("description") or "").strip() or title desc = escape_wikitext(desc) if len(desc_orig) > 100: lang = guess_language.guess_language(desc_orig) - if lang != 'UNKNOWN': - desc = '{{' + lang + '|1=' + desc + '}}' + if lang != "UNKNOWN": + desc = "{{" + lang + "|1=" + desc + "}}" return desc def _uploader(url, ie_key, title, info): - uploader = escape_wikitext((info.get('uploader') or '').strip()) - uploader_url = info.get('uploader_url') or '' + uploader = escape_wikitext((info.get("uploader") or "").strip()) + uploader_url = info.get("uploader_url") or "" if uploader_url: # HACK: YouTube outputs http:// atm (issue #80) - if ie_key == 'Youtube': - uploader_url = uploader_url.replace('http://', 'https://') - uploader = '[%s %s]' % (uploader_url, uploader) + if ie_key == "Youtube": + uploader_url = uploader_url.replace("http://", "https://") + uploader = "[%s %s]" % (uploader_url, uploader) return uploader def _license(url, ie_key, title, info): - uploader = info.get('uploader') - uploader_param = '' + uploader = info.get("uploader") + uploader_param = "" if uploader: - uploader_param = '|' + escape_wikitext(uploader.strip()) + uploader_param = "|" + escape_wikitext(uploader.strip()) default = DEFAULT_LICENSE - if ie_key == 'Youtube' and info.get('license') == \ - 'Creative Commons Attribution license (reuse allowed)': - if _date(url, ie_key, title, info) <= '2025-08-01': - return '{{YouTube CC-BY%s}}' % uploader_param - return '{{YouTube CC-BY 4.0%s}}' % uploader_param - elif ie_key == 'Flickr': + if ( + ie_key == "Youtube" + and info.get("license") + == "Creative Commons Attribution license (reuse allowed)" + ): + if _date(url, ie_key, title, info) <= "2025-08-01": + return "{{YouTube CC-BY%s}}" % uploader_param + return "{{YouTube CC-BY 4.0%s}}" % uploader_param + elif ie_key == "Flickr": return { - 'Attribution': - '{{cc-by-2.0%s}}' % uploader_param, - 'Attribution-ShareAlike': - '{{cc-by-sa-2.0%s}}' % uploader_param, - 'No known copyright restrictions': - '{{Flickr-no known copyright restrictions}}', - 'United States government work': - '{{PD-USGov}}', - 'Public Domain Dedication (CC0)': - '{{cc-zero}}', - 'Public Domain Work': - '{{safesubst:Flickr-public domain mark/subst}}', - 'Public Domain Mark': - '{{safesubst:Flickr-public domain mark/subst}}', - }.get(info.get('license'), default) - elif ie_key == 'Vimeo': + "Attribution": "{{cc-by-2.0%s}}" % uploader_param, + "Attribution-ShareAlike": "{{cc-by-sa-2.0%s}}" % uploader_param, + "No known copyright restrictions": "{{Flickr-no known copyright restrictions}}", + "United States government work": "{{PD-USGov}}", + "Public Domain Dedication (CC0)": "{{cc-zero}}", + "Public Domain Work": "{{safesubst:Flickr-public domain mark/subst}}", + "Public Domain Mark": "{{safesubst:Flickr-public domain mark/subst}}", + }.get(info.get("license"), default) + elif ie_key == "Vimeo": return { - 'by': - '{{cc-by-3.0%s}}' % uploader_param, - 'by-sa': - '{{cc-by-sa-3.0%s}}' % uploader_param, - 'cc0': - '{{cc-zero}}', - }.get(info.get('license'), default) - elif ie_key == 'PeerTube': + "by": "{{cc-by-3.0%s}}" % uploader_param, + "by-sa": "{{cc-by-sa-3.0%s}}" % uploader_param, + "cc0": "{{cc-zero}}", + }.get(info.get("license"), default) + elif ie_key == "PeerTube": return { - 'Attribution': - '{{cc-by-4.0%s}}' % uploader_param, - 'Attribution - Share Alike': - '{{cc-by-sa-4.0%s}}' % uploader_param, - 'Public Domain Dedication': - '{{cc-zero}}', - }.get(info.get('license'), default) + "Attribution": "{{cc-by-4.0%s}}" % uploader_param, + "Attribution 
- Share Alike": "{{cc-by-sa-4.0%s}}" % uploader_param,
+            "Public Domain Dedication": "{{cc-zero}}",
+        }.get(info.get("license"), default)
     return default
 
 
 def escape_wikitext(wikitext):
     """Escape wikitext for use in file description."""
-    rep = OrderedDict([
-        ('{|', '{{(}}&#124;'),
-        ('|}', '&#124;{{)}}'),
-        ('||', '&#124;&#124;'),
-        ('|', '&#124;'),
-        ('[[', '{{!((}}'),
-        (']]', '{{))!}}'),
-        ('{{', '{{((}}'),
-        ('}}', '{{))}}'),
-        ('{', '{{(}}'),
-        ('}', '{{)}}'),
-    ])
+    rep = OrderedDict(
+        [
+            ("{|", "{{(}}&#124;"),
+            ("|}", "&#124;{{)}}"),
+            ("||", "&#124;&#124;"),
+            ("|", "&#124;"),
+            ("[[", "{{!((}}"),
+            ("]]", "{{))!}}"),
+            ("{{", "{{((}}"),
+            ("}}", "{{))}}"),
+            ("{", "{{(}}"),
+            ("}", "{{)}}"),
+        ]
+    )
     rep = dict((re.escape(k), v) for k, v in rep.items())
     pattern = re.compile("|".join(list(rep.keys())))
     return pattern.sub(lambda m: rep[re.escape(m.group(0))], wikitext)
@@ -286,93 +285,66 @@ def get_emoji_regexp():
     # Sort emoji by length to make sure multi-character emojis are
     # matched first
     emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
-    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
+    pattern = "(" + "|".join(re.escape(u) for u in emojis) + ")"
     return re.compile(pattern)
 
 
 # Source: mediawiki.Title.js@9df363d
 sanitationRules = [
     # issue #101
-    {
-        'pattern': get_emoji_regexp(),
-        'replace': ''
-    },
+    {"pattern": get_emoji_regexp(), "replace": ""},
     # "signature"
-    {
-        'pattern': re.compile(r'~{3}'),
-        'replace': ''
-    },
+    {"pattern": re.compile(r"~{3}"), "replace": ""},
     # Space, underscore, tab, NBSP and other unusual spaces
     {
-        'pattern': re.compile(r'[ _\u0009\u00A0\u1680\u180E\u2000-\u200A'
-                              r'\u2028\u2029\u202F\u205F\u3000\s]+'),
-        'replace': ' '
+        "pattern": re.compile(
+            r"[ _\u0009\u00A0\u1680\u180E\u2000-\u200A"
+            r"\u2028\u2029\u202F\u205F\u3000\s]+"
+        ),
+        "replace": " ",
     },
     # issue #96
-    {
-        'pattern': re.compile(r'\u200B'),
-        'replace': ''
-    },
+    {"pattern": re.compile(r"\u200B"), "replace": ""},
     # unicode bidi override characters: Implicit, Embeds, Overrides
-    {
-        'pattern': re.compile(r'[\u200E\u200F\u202A-\u202E]'),
-        'replace': ''
-    },
+    {"pattern": re.compile(r"[\u200E\u200F\u202A-\u202E]"), "replace": ""},
     # control characters
-    {
-        'pattern': re.compile(r'[\x00-\x1f\x7f]'),
-        'replace': ''
-    },
+    {"pattern": re.compile(r"[\x00-\x1f\x7f]"), "replace": ""},
     # URL encoding (possibly)
-    {
-        'pattern': re.compile(r'%([0-9A-Fa-f]{2})'),
-        'replace': r'% \1'
-    },
+    {"pattern": re.compile(r"%([0-9A-Fa-f]{2})"), "replace": r"% \1"},
     # HTML-character-entities
     {
-        'pattern': re.compile(r'&(([A-Za-z0-9\x80-\xff]+|'
-                              r'#[0-9]+|#x[0-9A-Fa-f]+);)'),
-        'replace': r'& \1'
+        "pattern": re.compile(
+            r"&(([A-Za-z0-9\x80-\xff]+|" r"#[0-9]+|#x[0-9A-Fa-f]+);)"
+        ),
+        "replace": r"& \1",
     },
     # slash, colon (not supported by file systems like NTFS/Windows,
     # Mac OS 9 [:], ext4 [/])
-    {
-        'pattern': re.compile(r'[:/#]'),
-        'replace': '-'
-    },
+    {"pattern": re.compile(r"[:/#]"), "replace": "-"},
     # brackets, greater than
-    {
-        'pattern': re.compile(r'[\]\}>]'),
-        'replace': ')'
-    },
+    {"pattern": re.compile(r"[\]\}>]"), "replace": ")"},
     # brackets, lower than
-    {
-        'pattern': re.compile(r'[\[\{<]'),
-        'replace': '('
-    },
+    {"pattern": re.compile(r"[\[\{<]"), "replace": "("},
    # directory structures
     {
-        'pattern': re.compile(r'^(\.|\.\.|\./.*|\.\./.*|.*/\./.*|'
-                              r'.*/\.\./.*|.*/\.|.*/\.\.)$'),
-        'replace': ''
+        "pattern": re.compile(
+            r"^(\.|\.\.|\./.*|\.\./.*|.*/\./.*|" r".*/\.\./.*|.*/\.|.*/\.\.)$"
+        ),
+        "replace": "",
     },
     # everything that wasn't covered yet
-    {
-        'pattern': re.compile(r'[|#+?:/\\\u0000-\u001f\u007f]'),
-        'replace': '-'
-    },
+    {"pattern": re.compile(r"[|#+?:/\\\u0000-\u001f\u007f]"), "replace": "-"},
     # titleblacklist-custom-double-apostrophe
-    {
-        'pattern': re.compile(r"'{2,}"),
-        'replace': '"'
-    },
+    {"pattern": re.compile(r"'{2,}"), "replace": '"'},
 ]
 
 
 def sanitize(filename):
     """Sanitize a filename for uploading."""
     for rule in sanitationRules:
-        filename = rule['pattern'].sub(rule['replace'], filename)
+        filename = rule["pattern"].sub(rule["replace"], filename)
 
     return filename
 
@@ -387,40 +359,40 @@ def capitalize_first_letter(input_string):
 
 def do_validate_filename(filename):
     """Validate filename for invalid characters/parts."""
-    assert len(filename.encode('utf-8')) <= MAX_FILENAME_SIZE, \
-        'Your filename is too long'
-    assert len(filename) == len(filename.lstrip()), \
-        'Your filename contains leading spaces'
-    assert len(filename) == len(filename.rstrip()), \
-        'Your filename contains trailing spaces'
+    assert len(filename.encode("utf-8")) <= MAX_FILENAME_SIZE, (
+        "Your filename is too long"
+    )
+    assert len(filename) == len(filename.lstrip()), (
+        "Your filename contains leading spaces"
+    )
+    assert len(filename) == len(filename.rstrip()), (
+        "Your filename contains trailing spaces"
+    )
 
     for rule in sanitationRules:
-        reobj = rule['pattern'].search(filename)
-        assert not reobj or reobj.group(0) == ' ', \
-            'Your filename contains an illegal part: %r' % reobj.group(0)
+        reobj = rule["pattern"].search(filename)
+        assert not reobj or reobj.group(0) == " ", (
+            "Your filename contains an illegal part: %r" % reobj.group(0)
+        )
 
-    return filename.replace('_', ' ')
+    return filename.replace("_", " ")
 
 
 def do_validate_filedesc(filedesc):
     """Validate file description for invalid characters/parts."""
     parse = SITE.simple_request(
-        action='parse',
-        text=filedesc,
-        prop='externallinks'
+        action="parse", text=filedesc, prop="externallinks"
     ).submit()
-    externallinks = parse.get('parse', {}).get('externallinks', [])
+    externallinks = parse.get("parse", {}).get("externallinks", [])
 
     if externallinks:
-        spam = SITE.simple_request(
-            action='spamblacklist',
-            url=externallinks
-        ).submit()
+        spam = SITE.simple_request(action="spamblacklist", url=externallinks).submit()
 
-        assert spam.get('spamblacklist', {}).get('result') != 'blacklisted', \
-            ('Your file description matches spam blacklist! Matches: %s' %
-             ', '.join(spam.get('spamblacklist', {}).get('matches', [])))
+        assert spam.get("spamblacklist", {}).get("result") != "blacklisted", (
+            "Your file description matches spam blacklist! Matches: %s"
+            % ", ".join(spam.get("spamblacklist", {}).get("matches", []))
+        )
 
     return filedesc
 
@@ -437,14 +409,14 @@ def do_validate_filename_unique(filename):
 
     # The built-in 'capitalize()' method isn't used since it lowers the rest of
     # the string, which would also break the comparison.
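    # For example, capitalize_first_letter("foo.WebM") yields "Foo.WebM",
    # whereas "foo.WebM".capitalize() would yield "Foo.webm" and miss an
    # existing title in the comparison below.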
conflicting_names = { - capitalize_first_letter(f"{filename}.{format}") - for format in formats + capitalize_first_letter(f"{filename}.{format}") for format in formats } pages = SITE.allpages(prefix=filename, namespace=NAMESPACE_FILE) for page in pages: - assert page.title(with_ns=False) not in conflicting_names, \ - f'A filename with the same name already exists: {page.full_url()}' + assert page.title(with_ns=False) not in conflicting_names, ( + f"A filename with the same name already exists: {page.full_url()}" + ) return filename @@ -460,7 +432,7 @@ def do_validate_youtube_id(youtube_id): """ results = WcqsSession().query(query) - if len(results['results']['bindings']) == 0: + if len(results["results"]["bindings"]) == 0: return None - return results['results']['bindings'][0]['file']['value'] + return results["results"]["bindings"][0]["file"]["value"] diff --git a/video2commons/frontend/wcqs.py b/video2commons/frontend/wcqs.py index 362b96b7..ce46b7df 100644 --- a/video2commons/frontend/wcqs.py +++ b/video2commons/frontend/wcqs.py @@ -6,7 +6,8 @@ from typing import Any from video2commons.frontend.shared import redisconnection -class WcqsSession(): + +class WcqsSession: """This class manages WCQS sessions and executes SPARQL queries. Relevant Documentation: @@ -21,18 +22,18 @@ def query(self, query: str): """Queries the Wikimedia Commons Query Service.""" retry_after_ts = self._check_retry() if retry_after_ts: - retry_after = int((retry_after_ts - datetime.now(timezone.utc)).total_seconds()) - raise RuntimeError( - f'Too many requests, try again in {retry_after} seconds' + retry_after = int( + (retry_after_ts - datetime.now(timezone.utc)).total_seconds() ) + raise RuntimeError(f"Too many requests, try again in {retry_after} seconds") # Make the SPARQL request using the provided query. response = self.session.get( - 'https://commons-query.wikimedia.org/sparql', - params={'query': query}, + "https://commons-query.wikimedia.org/sparql", + params={"query": query}, headers={ - 'Accept': 'application/sparql-results+json', - 'User-Agent': 'video2commons-bot/1.0 (https://video2commons.toolforge.org/)' + "Accept": "application/sparql-results+json", + "User-Agent": "video2commons-bot/1.0 (https://video2commons.toolforge.org/)", }, # Set-Cookie session refresh headers get sent with a 307 redirect. allow_redirects=True, @@ -44,29 +45,27 @@ def query(self, query: str): # # https://wikitech.wikimedia.org/wiki/Robot_policy#Generally_applicable_rules if response.status_code == 429: - retry_after = response.headers.get('Retry-After') or 60 + retry_after = response.headers.get("Retry-After") or 60 self._set_retry(int(retry_after)) - raise RuntimeError( - f'Too many requests, try again in {retry_after} seconds' - ) + raise RuntimeError(f"Too many requests, try again in {retry_after} seconds") # Handle other unexpected response codes. 
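        # A successful SPARQL response carries exactly this media type; any
        # other status or content type is surfaced as an error, with the
        # response body included to aid debugging.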
- content_type = response.headers.get('Content-Type') + content_type = response.headers.get("Content-Type") if ( response.status_code < 200 or response.status_code >= 300 - or content_type != 'application/sparql-results+json;charset=utf-8' + or content_type != "application/sparql-results+json;charset=utf-8" ): raise RuntimeError( - f'Got unexpected response from SPARQL ({response.status_code}): {response.text}' + f"Got unexpected response from SPARQL ({response.status_code}): {response.text}" ) return response.json() def _check_retry(self): """Checks if we're rate limited before making SPARQL requests.""" - retry_after = redisconnection.get('wcqs:retry-after') + retry_after = redisconnection.get("wcqs:retry-after") if retry_after: retry_after_ts = datetime.fromisoformat(retry_after) @@ -80,69 +79,72 @@ def _set_retry(self, retry_after: int): retry_after_ts = datetime.now(timezone.utc) + timedelta(seconds=retry_after) redisconnection.setex( - 'wcqs:retry-after', + "wcqs:retry-after", retry_after, - retry_after_ts.replace(tzinfo=timezone.utc).isoformat() + retry_after_ts.replace(tzinfo=timezone.utc).isoformat(), ) def _get_cookies(self) -> list[dict[str, Any]]: """Retrieve cookies from Redis or the filesystem.""" - cookies = redisconnection.get('wcqs:session') + cookies = redisconnection.get("wcqs:session") if cookies: return json.loads(cookies) - current_app.logger.warning('Pulling in WCQS session from file as fallback') + current_app.logger.warning("Pulling in WCQS session from file as fallback") try: # Fallback: Pull in cookies from file. Needed for initial setup. - with open('/data/project/video2commons/wcqs-session.json', 'r') as f: + with open("/data/project/video2commons/wcqs-session.json", "r") as f: return json.load(f) except FileNotFoundError: - raise RuntimeError('No WCQS session found in Redis or filesystem') + raise RuntimeError("No WCQS session found in Redis or filesystem") def _set_cookies(self, cookies: list[dict[str, Any]]): """Load authentication cookies into the session.""" - cookie_dict = {(cookie['domain'], cookie['name']): cookie for cookie in cookies} + cookie_dict = {(cookie["domain"], cookie["name"]): cookie for cookie in cookies} # wcqsOauth is a long lived cookie that wcqs uses to authenticate the # user against commons.wikimedia.org. This cookie is used to refresh # the wcqsSession cookie. - wcqsOauth = cookie_dict.get(('commons-query.wikimedia.org', 'wcqsOauth')) + wcqsOauth = cookie_dict.get(("commons-query.wikimedia.org", "wcqsOauth")) if wcqsOauth: self.session.cookies.set( - name='wcqsOauth', - value=wcqsOauth['value'], - domain=wcqsOauth['domain'], - path=wcqsOauth['path'], - secure=wcqsOauth['secure'], + name="wcqsOauth", + value=wcqsOauth["value"], + domain=wcqsOauth["domain"], + path=wcqsOauth["path"], + secure=wcqsOauth["secure"], expires=None, # Intentional as wcqsOauth is long-lived ) else: - raise RuntimeError('wcqsOauth cookie not found') + raise RuntimeError("wcqsOauth cookie not found") # wcqsSession is a short lived cookie (2 hour lifetime) holding a JWT # that grants query access to wcqs. This cookie is provided in a 307 # redirect to any request that has a valid wcqsOauth cookie but no # valid wcqsSession cookie. 
- wcqsSession = cookie_dict.get(('commons-query.wikimedia.org', 'wcqsSession')) + wcqsSession = cookie_dict.get(("commons-query.wikimedia.org", "wcqsSession")) if wcqsSession: self.session.cookies.set( - name='wcqsSession', - value=wcqsSession['value'], - domain=wcqsSession['domain'], - path=wcqsSession['path'], - secure=wcqsSession['secure'], - expires=int(wcqsSession['expirationDate']), + name="wcqsSession", + value=wcqsSession["value"], + domain=wcqsSession["domain"], + path=wcqsSession["path"], + secure=wcqsSession["secure"], + expires=int(wcqsSession["expirationDate"]), ) def _save_cookies(self): """Save cookies from the session to Redis.""" - cookies = [{ - 'name': cookie.name, - 'value': cookie.value, - 'domain': cookie.domain, - 'path': cookie.path, - 'expirationDate': cookie.expires, - 'secure': cookie.secure, - } for cookie in self.session.cookies] - - redisconnection.set('wcqs:session', json.dumps(cookies)) + cookies = [ + { + "name": cookie.name, + "value": cookie.value, + "domain": cookie.domain, + "path": cookie.path, + "expirationDate": cookie.expires, + "secure": cookie.secure, + } + for cookie in self.session.cookies + ] + + redisconnection.set("wcqs:session", json.dumps(cookies)) diff --git a/video2commons/shared/stats.py b/video2commons/shared/stats.py index 20f4a096..070bb027 100644 --- a/video2commons/shared/stats.py +++ b/video2commons/shared/stats.py @@ -3,7 +3,7 @@ import json import time -LOCK_KEY = 'stats_lock' +LOCK_KEY = "stats_lock" def collect_worker_stats(conn, inspector): @@ -13,8 +13,8 @@ def collect_worker_stats(conn, inspector): if stats: for _, worker_stats in stats.items(): - pool = worker_stats.get('pool', {}) - max_concurrency = pool.get('max-concurrency', 0) + pool = worker_stats.get("pool", {}) + max_concurrency = pool.get("max-concurrency", 0) total_capacity += max_concurrency active_tasks = inspector.active() @@ -26,26 +26,26 @@ def collect_worker_stats(conn, inspector): total_active += len(tasks) for task in tasks: - task_id = task.get('id') + task_id = task.get("id") if task_id: task_ids.append(task_id) queue_length = get_queue_length(conn) return { - 'task_ids': task_ids, - 'pending': queue_length, - 'capacity': total_capacity, - 'processing': total_active, - 'available': total_capacity - total_active, - 'utilization': (total_active / total_capacity) if total_capacity > 0 else 0, - 'last_updated_by_job': int(time.time()), + "task_ids": task_ids, + "pending": queue_length, + "capacity": total_capacity, + "processing": total_active, + "available": total_capacity - total_active, + "utilization": (total_active / total_capacity) if total_capacity > 0 else 0, + "last_updated_by_job": int(time.time()), } def get_queue_length(conn): """Get the number of messages waiting in the broker queue.""" - return conn.llen('celery') + conn.hlen('unacked') + return conn.llen("celery") + conn.hlen("unacked") def update_task_stats(conn, task_id, remove=False): @@ -56,34 +56,36 @@ def update_task_stats(conn, task_id, remove=False): raise RuntimeError("Could not acquire write lock on stats key.") try: - serialized_stats = conn.get('stats') + serialized_stats = conn.get("stats") if not serialized_stats: raise RuntimeError("No stats are available, aborting.") stats = json.loads(serialized_stats) if not remove: - stats['task_ids'].append(task_id) + stats["task_ids"].append(task_id) else: # This can fail with a ValueError, but that's fine since we don't # want to write to the key if this happens anyway. 
- stats['task_ids'].remove(task_id) + stats["task_ids"].remove(task_id) - stats['processing'] = len(stats['task_ids']) - stats['available'] = stats['capacity'] - stats['processing'] - stats['utilization'] = (stats['processing'] / stats['capacity']) if stats['capacity'] > 0 else 0 + stats["processing"] = len(stats["task_ids"]) + stats["available"] = stats["capacity"] - stats["processing"] + stats["utilization"] = ( + (stats["processing"] / stats["capacity"]) if stats["capacity"] > 0 else 0 + ) # Update the queued tasks counter, which only tracks tasks that haven't # been picked up by any workers yet. if not remove: - stats['pending'] = max(stats['pending'] - 1, 0) + stats["pending"] = max(stats["pending"] - 1, 0) # FAILSAFE: We shouldn't get weird numbers in stats, but be safe. if ( - stats['available'] > stats['capacity'] - or stats['available'] < 0 - or stats['processing'] > stats['capacity'] - or stats['processing'] < 0 + stats["available"] > stats["capacity"] + or stats["available"] < 0 + or stats["processing"] > stats["capacity"] + or stats["processing"] < 0 ): raise RuntimeError("Received invalid stats, aborting.") @@ -100,12 +102,12 @@ def increment_queue_counter(conn): raise RuntimeError("Could not acquire write lock on stats key.") try: - serialized_stats = conn.get('stats') + serialized_stats = conn.get("stats") if not serialized_stats: raise RuntimeError("No stats are available, aborting.") stats = json.loads(serialized_stats) - stats['pending'] = stats.get('pending', 0) + 1 + stats["pending"] = stats.get("pending", 0) + 1 update_worker_stats(conn, stats) finally: @@ -116,7 +118,7 @@ def acquire_write_lock(conn): """Acquire a write lock on the stats key (~1 second timeout).""" for _ in range(10): try: - lock_acquired = conn.set(LOCK_KEY, '1', nx=True, ex=2) + lock_acquired = conn.set(LOCK_KEY, "1", nx=True, ex=2) if lock_acquired: return True except Exception: @@ -137,7 +139,7 @@ def release_write_lock(conn): def get_worker_stats(conn): """Get worker stats from Redis.""" - serialized_stats = conn.get('stats') + serialized_stats = conn.get("stats") if not serialized_stats: return None @@ -146,4 +148,4 @@ def get_worker_stats(conn): def update_worker_stats(conn, stats): """Update worker stats in Redis.""" - conn.set('stats', json.dumps(stats)) + conn.set("stats", json.dumps(stats)) diff --git a/video2commons/user-config.py b/video2commons/user-config.py index 6e909215..884b1ac1 100644 --- a/video2commons/user-config.py +++ b/video2commons/user-config.py @@ -4,7 +4,7 @@ """Pywikibot configs.""" -family = 'commons' -mylang = 'commons' +family = "commons" +mylang = "commons" socket_timeout = 30, 300 # chunked uploading unreliable diff --git a/www/python/src/app.py b/www/python/src/app.py index 6a0c169f..22ad835c 100644 --- a/www/python/src/app.py +++ b/www/python/src/app.py @@ -19,13 +19,12 @@ """video2commons web frontend wrapper.""" - - import os import sys + sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../../../") from video2commons.frontend import app # NOQA -if __name__ == '__main__': +if __name__ == "__main__": app.run() diff --git a/www/python/src/user-config.py b/www/python/src/user-config.py index 6e909215..884b1ac1 100644 --- a/www/python/src/user-config.py +++ b/www/python/src/user-config.py @@ -4,7 +4,7 @@ """Pywikibot configs.""" -family = 'commons' -mylang = 'commons' +family = "commons" +mylang = "commons" socket_timeout = 30, 300 # chunked uploading unreliable