Skip to content

Commit f83b483

Browse files
cthoyt and hrshdhgd authored
Add windows testing and update path construction (#171)
* Add windows testing * Add pathlib support Also cleans up some suspect file path concatenation in testing * linted and flake8 compliant Co-authored-by: Harshad <hrshdhgd@users.noreply.github.com> Co-authored-by: Harshad Hegde <hegdehb@gmail.com>
1 parent 1dd41ab commit f83b483

File tree

11 files changed

+55
-47
lines changed

11 files changed

+55
-47
lines changed

.github/workflows/qc.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@ on:
1010
jobs:
1111
build:
1212

13-
runs-on: ubuntu-latest
13+
runs-on: ${{ matrix.os }}
1414
strategy:
1515
matrix:
1616
python-version: [3.7, 3.8, 3.9]
17+
os: [ ubuntu-latest, windows-latest ]
1718

1819
steps:
1920
- uses: actions/checkout@v2

sssom/parsers.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66
import typing
77
from collections import Counter
8+
from pathlib import Path
89
from typing import Any, Callable, Dict, List, Optional, Set, TextIO, Tuple, Union, cast
910
from urllib.request import urlopen
1011
from xml.dom import Node, minidom
@@ -47,7 +48,7 @@
4748

4849

4950
def read_sssom_table(
50-
file_path: str,
51+
file_path: Union[str, Path],
5152
prefix_map: Optional[PrefixMap] = None,
5253
meta: Optional[MetadataType] = None,
5354
) -> MappingSetDataFrame:
@@ -571,8 +572,16 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
571572
return mapping
572573

573574

574-
def _read_metadata_from_table(path: str) -> Dict[str, Any]:
575-
if validators.url(path):
575+
def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
576+
if isinstance(path, Path) or not validators.url(path):
577+
with open(path) as file:
578+
yamlstr = ""
579+
for line in file:
580+
if line.startswith("#"):
581+
yamlstr += re.sub("^#", "", line)
582+
else:
583+
break
584+
else:
576585
response = urlopen(path)
577586
yamlstr = ""
578587
for lin in response:
@@ -581,14 +590,7 @@ def _read_metadata_from_table(path: str) -> Dict[str, Any]:
581590
yamlstr += re.sub("^#", "", line)
582591
else:
583592
break
584-
else:
585-
with open(path) as file:
586-
yamlstr = ""
587-
for line in file:
588-
if line.startswith("#"):
589-
yamlstr += re.sub("^#", "", line)
590-
else:
591-
break
593+
592594
if yamlstr:
593595
meta = yaml.safe_load(yamlstr)
594596
logging.info(f"Meta={meta}")

sssom/util.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from collections import defaultdict
99
from dataclasses import dataclass, field
1010
from io import StringIO
11+
from pathlib import Path
1112
from typing import (
1213
Any,
1314
DefaultDict,
@@ -719,7 +720,7 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
719720
return msdf
720721

721722

722-
def get_file_extension(file: Union[str, TextIO]) -> str:
723+
def get_file_extension(file: Union[str, Path, TextIO]) -> str:
723724
"""Get file extension.
724725
725726
:param file: File path
@@ -728,6 +729,8 @@ def get_file_extension(file: Union[str, TextIO]) -> str:
728729
"""
729730
if isinstance(file, str):
730731
filename = file
732+
elif isinstance(file, Path):
733+
return file.suffix
731734
else:
732735
filename = file.name
733736
parts = filename.split(".")
@@ -739,7 +742,7 @@ def get_file_extension(file: Union[str, TextIO]) -> str:
739742

740743

741744
def read_csv(
742-
filename: Union[str, TextIO], comment: str = "#", sep: str = ","
745+
filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
743746
) -> pd.DataFrame:
744747
"""Read a CSV that contains frontmatter commented by a specific character.
745748
@@ -753,7 +756,10 @@ def read_csv(
753756
"""
754757
if isinstance(filename, TextIO):
755758
return pd.read_csv(filename, sep=sep)
756-
if validators.url(filename):
759+
if isinstance(filename, Path) or not validators.url(filename):
760+
with open(filename, "r") as f:
761+
lines = "".join([line for line in f if not line.startswith(comment)])
762+
else:
757763
response = urlopen(filename)
758764
lines = "".join(
759765
[
@@ -762,9 +768,6 @@ def read_csv(
762768
if not line.decode("utf-8").startswith(comment)
763769
]
764770
)
765-
else:
766-
with open(filename, "r") as f:
767-
lines = "".join([line for line in f if not line.startswith(comment)])
768771
return pd.read_csv(StringIO(lines), sep=sep)
769772

770773

@@ -778,7 +781,9 @@ def read_metadata(filename: str) -> Metadata:
778781
return Metadata(prefix_map=prefix_map, metadata=metadata)
779782

780783

781-
def read_pandas(file: Union[str, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
784+
def read_pandas(
785+
file: Union[str, Path, TextIO], sep: Optional[str] = None
786+
) -> pd.DataFrame:
782787
"""Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
783788
784789
:param file: The file to read. If no separator is given, this file should be named.
@@ -982,14 +987,17 @@ def prepare_context_str(prefix_map: Optional[PrefixMap] = None, **kwargs) -> str
982987
return json.dumps(prepare_context(prefix_map), **kwargs)
983988

984989

985-
def raise_for_bad_path(file_path: str) -> None:
990+
def raise_for_bad_path(file_path: Union[str, Path]) -> None:
986991
"""Raise exception if file path is invalid.
987992
988993
:param file_path: File path
989-
:raises ValueError: Invalid file path
994+
:raises FileNotFoundError: Invalid file path
990995
"""
991-
if not validators.url(file_path) and not os.path.exists(file_path):
992-
raise ValueError(f"{file_path} is not a valid file path or url.")
996+
if isinstance(file_path, Path):
997+
if not file_path.is_file():
998+
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
999+
elif not validators.url(file_path) and not os.path.exists(file_path):
1000+
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
9931001

9941002

9951003
def is_multivalued_slot(slot: str) -> bool:

tests/constants.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
"""Constants for test cases."""
22

3-
import os
43
import pathlib
54

65
cwd = pathlib.Path(__file__).parent.resolve()
7-
data_dir = os.path.join(cwd, "data")
6+
data_dir = cwd / "data"
87

9-
test_out_dir = os.path.join(cwd, "tmp")
10-
os.makedirs(test_out_dir, exist_ok=True)
8+
test_out_dir = cwd / "tmp"
9+
test_out_dir.mkdir(parents=True, exist_ok=True)

tests/test_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def run_parse(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
113113
def run_split(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
114114
"""Run the split test."""
115115
result = runner.invoke(
116-
split, [test_case.filepath, "--output-directory", test_out_dir]
116+
split, [test_case.filepath, "--output-directory", test_out_dir.as_posix()]
117117
)
118118
self.run_successful(result, test_case)
119119
return result
@@ -172,7 +172,7 @@ def run_partition(
172172
if not primary_test_case:
173173
primary_test_case = t
174174
params.append(t.filepath)
175-
params.extend(["--output-directory", test_out_dir])
175+
params.extend(["--output-directory", test_out_dir.as_posix()])
176176
result = runner.invoke(partition, params)
177177
self.run_successful(result, primary_test_case)
178178
return result

tests/test_collapse.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class TestCollapse(unittest.TestCase):
2020

2121
def setUp(self) -> None:
2222
"""Set up the test case."""
23-
self.df = parse(f"{data_dir}/basic.tsv")
23+
self.df = parse(data_dir / "basic.tsv")
2424

2525
def test_row_count(self):
2626
"""Test the dataframe has the correct number of rows."""
@@ -69,7 +69,7 @@ def test_diff(self):
6969
print(output)
7070
# print(diff)
7171

72-
df2 = parse(f"{data_dir}/basic2.tsv")
72+
df2 = parse(data_dir / "basic2.tsv")
7373
diff = compare_dataframes(self.df, df2)
7474
# print(len(diff.unique_tuples1))
7575
# print(len(diff.unique_tuples2))

tests/test_convert.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ class TestConvert(unittest.TestCase):
1212

1313
def setUp(self) -> None:
1414
"""Set up the test case with two tables."""
15-
self.msdf = read_sssom_table(f"{data_dir}/basic.tsv")
16-
self.cob = read_sssom_table(f"{data_dir}/cob-to-external.tsv")
15+
self.msdf = read_sssom_table(data_dir / "basic.tsv")
16+
self.cob = read_sssom_table(data_dir / "cob-to-external.tsv")
1717

1818
def test_df(self):
1919
"""Test the dataframe has the right number of mappings."""

tests/test_parsers.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ class TestParse(unittest.TestCase):
3030

3131
def setUp(self) -> None:
3232
"""Set up the test case."""
33-
if not os.path.exists(test_out_dir):
34-
os.mkdir(test_out_dir)
35-
3633
self.df_url = "https://raw.githubusercontent.com/mapping-commons/sssom-py/master/tests/data/basic.tsv"
3734
self.rdf_graph_file = f"{test_data_dir}/basic.sssom.rdf"
3835
self.rdf_graph = Graph()

tests/test_reconcile.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class TestReconcile(unittest.TestCase):
1313

1414
def setUp(self) -> None:
1515
"""Test up the test case with the third basic example."""
16-
self.msdf = read_sssom_table(f"{data_dir}/basic3.tsv")
16+
self.msdf = read_sssom_table(data_dir / "basic3.tsv")
1717

1818
def test_filter(self):
1919
"""Test filtering returns the right number of rows."""
@@ -27,17 +27,17 @@ def test_deal_with_negation(self):
2727

2828
def test_merge(self):
2929
"""Test merging two tables."""
30-
msdf1 = read_sssom_table(f"{data_dir}/basic.tsv")
31-
msdf2 = read_sssom_table(f"{data_dir}/basic2.tsv")
30+
msdf1 = read_sssom_table(data_dir / "basic.tsv")
31+
msdf2 = read_sssom_table(data_dir / "basic2.tsv")
3232

3333
merged_msdf = merge_msdf(msdf1=msdf1, msdf2=msdf2)
3434

3535
self.assertEqual(123, len(merged_msdf.df))
3636

3737
def test_merge_no_reconcile(self):
3838
"""Test merging two tables without reconciliation."""
39-
msdf1 = read_sssom_table(f"{data_dir}/basic4.tsv")
40-
msdf2 = read_sssom_table(f"{data_dir}/basic5.tsv")
39+
msdf1 = read_sssom_table(data_dir / "basic4.tsv")
40+
msdf2 = read_sssom_table(data_dir / "basic5.tsv")
4141

4242
merged_msdf = merge_msdf(msdf1=msdf1, msdf2=msdf2, reconcile=False)
4343

tests/test_rewire.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Tests for rewiring utilities."""
22

3+
import os
34
import unittest
45

56
from rdflib import Graph
@@ -14,9 +15,9 @@ class TestRewire(unittest.TestCase):
1415

1516
def setUp(self) -> None:
1617
"""Set up the test case with the COB mappings et and OWL graph."""
17-
self.mset = read_sssom_table(f"{data_dir}/cob-to-external.tsv")
18+
self.mset = read_sssom_table(data_dir / "cob-to-external.tsv")
1819
g = Graph()
19-
g.parse(f"{data_dir}/cob.owl", format="xml")
20+
g.parse(os.path.join(data_dir, "cob.owl"), format="xml")
2021
self.graph = g
2122

2223
def test_rewire(self):
@@ -27,5 +28,5 @@ def test_rewire(self):
2728

2829
n = rewire_graph(self.graph, self.mset, precedence=["PR"])
2930
print(f"Num changed = {n}")
30-
with open(f"{test_out_dir}/rewired-cob.ttl", "w") as stream:
31+
with open(test_out_dir / "rewired-cob.ttl", "w") as stream:
3132
stream.write(self.graph.serialize(format="turtle").decode())

0 commit comments

Comments
 (0)