diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e97955e..e121d71 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,8 +19,9 @@ jobs: os: [ubuntu-latest] EXTRA: [false] # used to force includes to get included include: - - python-version: '3.7' - os: ubuntu-20.04 # oldest version on github actions + - python-version: '3.8' + # see https://github.com/duckdb/duckdb/blob/main/.github/workflows/Python.yml for duckdb python versions + os: ubuntu-22.04 # oldest version on github actions EXTRA: true - python-version: '3.11' os: macos-latest @@ -28,7 +29,7 @@ jobs: - python-version: '3.13' os: macos-latest EXTRA: true - - python-version: '3.7' + - python-version: '3.8' os: windows-latest EXTRA: true - python-version: '3.13' @@ -72,4 +73,5 @@ jobs: run: make duck_cloudfront - name: make wreck_the_warc + shell: bash # or windows will throw an error run: make wreck_the_warc diff --git a/Makefile b/Makefile index bfbe5ae..8d6d7a4 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ wreck_the_warc: @echo we will break and then fix this warc cp whirlwind.warc.gz testing.warc.gz rm -f testing.warc - gunzip testing.warc.gz + gzip -d testing.warc.gz # windows gunzip no work-a @echo @echo iterate over this uncompressed warc: works python ./warcio-iterator.py testing.warc @@ -90,7 +90,7 @@ wreck_the_warc: python ./warcio-iterator.py testing.warc.gz || /usr/bin/true @echo @echo "now let's do it the right way" - gunzip testing.warc.gz + gzip -d testing.warc.gz warcio recompress testing.warc testing.warc.gz @echo @echo and now iterating works diff --git a/duck.py b/duck.py index f9b8e85..af1c677 100644 --- a/duck.py +++ b/duck.py @@ -2,9 +2,10 @@ import glob import json import os.path -import subprocess import sys import gzip +import platform +import io import duckdb @@ -82,6 +83,10 @@ def get_files(algo, crawl): def main(algo, crawl): + windows = True if platform.system() == 'Windows' else False + if windows: + # windows stdout is often cp1252 + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') files = get_files(algo, crawl) retries_left = 100