Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,17 @@ jobs:
os: [ubuntu-latest]
EXTRA: [false] # used to force includes to get included
include:
- python-version: '3.7'
os: ubuntu-20.04 # oldest version on github actions
- python-version: '3.8'
# see https://github.com/duckdb/duckdb/blob/main/.github/workflows/Python.yml for duckdb python versions
os: ubuntu-22.04 # oldest version on github actions
EXTRA: true
- python-version: '3.11'
os: macos-latest
EXTRA: true
- python-version: '3.13'
os: macos-latest
EXTRA: true
- python-version: '3.7'
- python-version: '3.8'
os: windows-latest
EXTRA: true
- python-version: '3.13'
Expand Down Expand Up @@ -72,4 +73,5 @@ jobs:
run: make duck_cloudfront

- name: make wreck_the_warc
shell: bash # or windows will throw an error
run: make wreck_the_warc
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ wreck_the_warc:
@echo we will break and then fix this warc
cp whirlwind.warc.gz testing.warc.gz
rm -f testing.warc
gunzip testing.warc.gz
gzip -d testing.warc.gz # use gzip -d because gunzip does not work on Windows
@echo
@echo iterate over this uncompressed warc: works
python ./warcio-iterator.py testing.warc
Expand All @@ -90,7 +90,7 @@ wreck_the_warc:
python ./warcio-iterator.py testing.warc.gz || /usr/bin/true
@echo
@echo "now let's do it the right way"
gunzip testing.warc.gz
gzip -d testing.warc.gz
warcio recompress testing.warc testing.warc.gz
@echo
@echo and now iterating works
Expand Down
7 changes: 6 additions & 1 deletion duck.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import glob
import json
import os.path
import subprocess
import sys
import gzip
import platform
import io

import duckdb

Expand Down Expand Up @@ -82,6 +83,10 @@ def get_files(algo, crawl):


def main(algo, crawl):
windows = True if platform.system() == 'Windows' else False
if windows:
# Windows stdout is often cp1252, so re-wrap it to emit UTF-8 instead
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
files = get_files(algo, crawl)
retries_left = 100

Expand Down