From ad4c028cd37f2173a510a31786f20cb489acd9f8 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 7 Jun 2025 03:03:14 +0000 Subject: [PATCH 1/4] windows workaround; deprecate ubuntu-20.04 --- .github/workflows/ci.yaml | 4 ++-- duck.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e97955e..cbb5dce 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,7 +20,7 @@ jobs: EXTRA: [false] # used to force includes to get included include: - python-version: '3.7' - os: ubuntu-20.04 # oldest version on github actions + os: ubuntu-22.04 # oldest version on github actions EXTRA: true - python-version: '3.11' os: macos-latest @@ -28,7 +28,7 @@ jobs: - python-version: '3.13' os: macos-latest EXTRA: true - - python-version: '3.7' + - python-version: '3.8' # duckdb didn't compile for 3.7 and doze 2022 10.0.20348 os: windows-latest EXTRA: true - python-version: '3.13' diff --git a/duck.py b/duck.py index f9b8e85..c288e0f 100644 --- a/duck.py +++ b/duck.py @@ -5,6 +5,7 @@ import subprocess import sys import gzip +import platform import duckdb @@ -82,6 +83,7 @@ def get_files(algo, crawl): def main(algo, crawl): + windows = True if platform.system() == 'Windows' else False files = get_files(algo, crawl) retries_left = 100 @@ -108,7 +110,8 @@ def main(algo, crawl): retries_left = 100 while True: try: - print(duckdb.sql('SELECT COUNT(*) FROM ccindex;')) + # if you print this in windows, it fails: UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-15: character maps to <undefined> + res = duckdb.sql('SELECT COUNT(*) FROM ccindex;') break except duckdb.InvalidInputException as e: # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...' 
@@ -119,6 +122,7 @@ def main(algo, crawl): retries_left -= 1 else: raise + print(res) # windows workaround sq2 = f''' select From ce0188b7bfc33e12f3064e1f73c7f7352dd6c722 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 7 Jun 2025 03:26:48 +0000 Subject: [PATCH 2/4] duckdb is py3.8 min, work around windows quirk --- .github/workflows/ci.yaml | 5 +++-- duck.py | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cbb5dce..78dcd65 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,7 +19,8 @@ jobs: os: [ubuntu-latest] EXTRA: [false] # used to force includes to get included include: - - python-version: '3.7' + - python-version: '3.8' + # see https://github.com/duckdb/duckdb/blob/main/.github/workflows/Python.yml for duckdb python versions os: ubuntu-22.04 # oldest version on github actions EXTRA: true - python-version: '3.11' @@ -28,7 +29,7 @@ jobs: - python-version: '3.13' os: macos-latest EXTRA: true - - python-version: '3.8' # duckdb didn't compile for 3.7 and doze 2022 10.0.20348 + - python-version: '3.8' os: windows-latest EXTRA: true - python-version: '3.13' diff --git a/duck.py b/duck.py index c288e0f..af1c677 100644 --- a/duck.py +++ b/duck.py @@ -2,10 +2,10 @@ import glob import json import os.path -import subprocess import sys import gzip import platform +import io import duckdb @@ -84,6 +84,9 @@ def get_files(algo, crawl): def main(algo, crawl): windows = True if platform.system() == 'Windows' else False + if windows: + # windows stdout is often cp1252 + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') files = get_files(algo, crawl) retries_left = 100 @@ -110,8 +113,7 @@ def main(algo, crawl): retries_left = 100 while True: try: - # if you print this in windows, it fails: UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-15: character maps to <undefined> - res = duckdb.sql('SELECT COUNT(*) FROM ccindex;') + 
print(duckdb.sql('SELECT COUNT(*) FROM ccindex;')) break except duckdb.InvalidInputException as e: # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...' @@ -122,7 +124,6 @@ def main(algo, crawl): retries_left -= 1 else: raise - print(res) # windows workaround sq2 = f''' select From db23de6ad7407b0b000a1386697c0029aef9bf7a Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 7 Jun 2025 03:52:20 +0000 Subject: [PATCH 3/4] maybe this will fix windows --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 78dcd65..e121d71 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -73,4 +73,5 @@ jobs: run: make duck_cloudfront - name: make wreck_the_warc + shell: bash # or windows will throw an error run: make wreck_the_warc From 3c4e03f0979e0b1e62bdf4fa54948b72057f4ee7 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 7 Jun 2025 04:00:49 +0000 Subject: [PATCH 4/4] maybe this will fix windows --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index bfbe5ae..8d6d7a4 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ wreck_the_warc: @echo we will break and then fix this warc cp whirlwind.warc.gz testing.warc.gz rm -f testing.warc - gunzip testing.warc.gz + gzip -d testing.warc.gz # windows gunzip no work-a @echo @echo iterate over this uncompressed warc: works python ./warcio-iterator.py testing.warc @@ -90,7 +90,7 @@ wreck_the_warc: python ./warcio-iterator.py testing.warc.gz || /usr/bin/true @echo @echo "now let's do it the right way" - gunzip testing.warc.gz + gzip -d testing.warc.gz warcio recompress testing.warc testing.warc.gz @echo @echo and now iterating works