diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml
new file mode 100644
index 0000000..5305e98
--- /dev/null
+++ b/.github/workflows/dataset_checks.yml
@@ -0,0 +1,35 @@
+# This workflow will install Python dependencies and run the dataset checks with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: dataset checks
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main", "essd_review" ]
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v5
+      with:
+        fetch-depth: 0
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pytest pandas geopandas delayed-assert gitpython
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Run Dataset Checks
+      run: |
+        pytest
diff --git a/essd/environment.yml b/environment.yml
similarity index 77%
rename from essd/environment.yml
rename to environment.yml
index f3be0f3..b321715 100644
--- a/essd/environment.yml
+++ b/environment.yml
@@ -10,3 +10,6 @@ dependencies:
   - squarify
   - geoutils
   - cartopy
+  - pytest
+  - gitpython
+  - delayed-assert
diff --git a/scripts/test_submission.py b/scripts/test_submission.py
new file mode 100644
index 0000000..a9ad057
--- /dev/null
+++ b/scripts/test_submission.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+from glob import glob
+from delayed_assert import expect, assert_expectations
+from git import Repo
+import pandas as pd
+import geopandas as gpd
+
+
+rgi_regions = ['RGI2000-v7.0-G-01_alaska',
+               'RGI2000-v7.0-G-02_western_canada_usa',
+               'RGI2000-v7.0-G-03_arctic_canada_north',
+               'RGI2000-v7.0-G-04_arctic_canada_south',
+               'RGI2000-v7.0-G-05_greenland_periphery',
+               'RGI2000-v7.0-G-06_iceland',
+               'RGI2000-v7.0-G-07_svalbard_jan_mayen',
+               'RGI2000-v7.0-G-08_scandinavia',
+               'RGI2000-v7.0-G-09_russian_arctic',
+               'RGI2000-v7.0-G-10_north_asia',
+               'RGI2000-v7.0-G-11_central_europe',
+               'RGI2000-v7.0-G-12_caucasus_middle_east',
+               'RGI2000-v7.0-G-13_central_asia',
+               'RGI2000-v7.0-G-14_south_asia_west',
+               'RGI2000-v7.0-G-15_south_asia_east',
+               'RGI2000-v7.0-G-16_low_latitudes',
+               'RGI2000-v7.0-G-17_southern_andes',
+               'RGI2000-v7.0-G-18_new_zealand',
+               'RGI2000-v7.0-G-19_subantarctic_antarctic_islands'
+]
+
+def test_columns():
+    """
+    Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on
+    what is found in lake_term_data_template.csv:
+
+        rgi_id,lake_cat,image_id,image_date,inventory_doi,contributor
+
+    Raises an AssertionError if one or more files are missing one or more columns.
+
+    """
+    # check all files in dataset/csv/ and dataset/contributor_files/
+    contribs = [f"dataset/csv/{fn}" for fn in glob('**/*.csv',
+                                                   root_dir='dataset/csv',
+                                                   recursive=True)] \
+        + [f"dataset/contributor_files/{fn}" for fn in glob('**/*.csv',
+                                                            root_dir='dataset/contributor_files',
+                                                            recursive=True)]
+
+    # get a list of "new" or changed files from the current branch
+    repo = Repo('.')
+    diff = [item.a_path for item in repo.index.diff('origin/main')]
+
+    new_contribs = list(set(contribs) & set(diff))
+
+    print(f"Found {len(new_contribs)} new or changed submissions:")
+    for fn in new_contribs:
+        print(fn)
+
+    # required columns
+    req_cols = pd.read_csv('lake_term_data_template.csv').columns
+
+    for fn_csv in new_contribs:
+        csv = pd.read_csv(fn_csv)
+        # first, check that columns are all there.
+        for col in req_cols:
+            expect(col in csv.columns, f"{col} not found in {fn_csv}: \n{list(csv.columns)}")
+
+    assert_expectations()
+
+
+def test_geopackage():
+    """
+    Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names.
+    """
+    req_cols = pd.read_csv('lake_term_data_template.csv').columns
+
+    for reg in rgi_regions:
+        expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(),
+               f"geopackage file not found in dataset/lakeflags/ for {reg}")
+        expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(),
+               f"geopackage file not found in dataset/outlines/ for {reg}")
+
+        lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg"))
+        outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg"))
+
+        for col in req_cols:
+            expect(col in lakeflag.columns, f"{col} not found in {reg} lakeflag file: \n{list(lakeflag.columns)}")
+            expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}")
+
+    assert_expectations()
+
+
+def test_lake_cat():
+    """
+    Tests whether the lake_cat value is the same in the csv tables and the geopackage files for all regions.
+    """
+
+    for reg in rgi_regions:
+        attributes = pd.read_csv(Path('dataset', 'csv', f"{reg}_lakeflag.csv")).set_index('rgi_id')
+
+        lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg")).set_index('rgi_id')
+        outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg")).set_index('rgi_id')
+
+        # can compare these directly, as they should be identically indexed (and if not, it's an error)
+        expect((attributes['lake_cat'] == lakeflag['lake_cat']).all(),
+               f"lake_cat doesn't match for {reg} points file.")
+
+        # select the shared index first, so that both Series are identically labeled
+        same_index = attributes.index[attributes.index.isin(outlines.index)]
+        expect((attributes.loc[same_index, 'lake_cat'] == outlines.loc[same_index, 'lake_cat']).all(),
+               f"lake_cat doesn't match for {reg} outlines.")
+
+    assert_expectations()