Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions augur/io/metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import os
from typing import Iterable, Sequence
from typing import Any, Iterable, Iterator, Sequence, overload
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered adding types for read_csv_with_index_col() but decided against it, as it relies on fragile imports from pandas._typing.

I don't really know the state of python typing, how does this tend to work for non-core libraries? The TS community has been really proactive about publishing separate type libraries for most common JS-only packages, although ensuring the versions are in-sync has been a little painful. Pandas clearly has some types (pd.DataFrame), do they hide FilePath, ReadCsvBuffer etc away because the types aren't stable?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pandas-stubs is basically the same thing (separate type library) and it's official, so sync is not much of an issue.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why pandas._typing is hidden away, but it's explicitly noted in docs.

pandas.api.typing should be stable. v3 includes FilePath and ReadCsvBuffer, but we still target compatibility with v1.

import pandas as pd
import pyfastx
import python_calamine as calamine
Expand All @@ -26,40 +26,60 @@ class InvalidDelimiter(Exception):
pass


# Overloads for different return types based on chunk_size
@overload
def read_metadata(
metadata_file,
delimiters=DEFAULT_DELIMITERS,
columns=None,
id_columns=DEFAULT_ID_COLUMNS,
keep_id_as_column=False,
chunk_size=None,
dtype=None,
metadata_file: str,
delimiters: Sequence[str] = ...,
columns: list[str] | None = ...,
id_columns: Sequence[str] = ...,
keep_id_as_column: bool = ...,
chunk_size: None = None,
dtype: dict[str, Any] | str | None = ...,
) -> pd.DataFrame: ...

@overload
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cost of overloaded signatures allows us to infer the return type based on the chunk_size type, yeah? I'm somewhat ambivalent about this as I find overload types a little confusing, but I presume this has been useful to you when writing scripts etc?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I added this to address the pyright error in #1961:

metadata = read_metadata(

dates, metadata[METADATA_DATE_COLUMN],

error: "__getitem__" method not defined on type "Generator[Unknown, None, None]" (reportIndexIssue)

def read_metadata(
metadata_file: str,
delimiters: Sequence[str] = ...,
columns: list[str] | None = ...,
id_columns: Sequence[str] = ...,
keep_id_as_column: bool = ...,
chunk_size: int = ...,
dtype: dict[str, Any] | str | None = ...,
) -> Iterator[pd.DataFrame]: ...

def read_metadata(
metadata_file: str,
delimiters: Sequence[str] = DEFAULT_DELIMITERS,
columns: list[str] | None = None,
id_columns: Sequence[str] = DEFAULT_ID_COLUMNS,
keep_id_as_column: bool = False,
chunk_size: int | None = None,
dtype: dict[str, Any] | str | None = None,
):
r"""Read metadata from a given filename and into a pandas `DataFrame` or
`TextFileReader` object.
iterator of DataFrames when `chunk_size` is specified.

Parameters
----------
metadata_file : str
metadata_file
Path to a metadata file to load.
delimiters : list of str
delimiters
List of possible delimiters to check for between columns in the metadata.
Only one delimiter will be inferred.
columns : list of str
columns
List of columns to read. If unspecified, read all columns.
id_columns : list of str
id_columns
List of possible id column names to check for, ordered by priority.
Only one id column will be inferred.
keep_id_as_column : bool
keep_id_as_column
If true, keep the resolved id column as a column in addition to setting it as the DataFrame index.
chunk_size : int
chunk_size
Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory.
dtype : dict or str
dtype
Data types to apply to columns in metadata. If unspecified, pandas data type inference will be used.
See documentation for an argument of the same name to `pandas.read_csv()`.
Returns
-------
pandas.DataFrame, or an iterator of pandas.DataFrame when `chunk_size` is specified

Raises
------
Expand Down Expand Up @@ -97,6 +117,7 @@ def read_metadata(
"skipinitialspace": True,
"na_filter": False,
"low_memory": False,
**PANDAS_READ_CSV_OPTIONS,
}

if chunk_size:
Expand All @@ -107,7 +128,6 @@ def read_metadata(
metadata_file,
iterator=True,
**kwargs,
**PANDAS_READ_CSV_OPTIONS,
)
chunk = metadata.read(nrows=1)
metadata.close()
Expand Down Expand Up @@ -168,13 +188,11 @@ def read_metadata(
return read_csv_with_index_col(
metadata_file,
**kwargs,
**PANDAS_READ_CSV_OPTIONS,
)
else:
return pd.read_csv(
metadata_file,
**kwargs,
**PANDAS_READ_CSV_OPTIONS,
)


Expand Down
6 changes: 6 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ def prose_list(items):
# This class can't be referenced.
# <https://github.com/python/cpython/issues/101503>
("py:class", "argparse._SubParsersAction"),

# sphinx-autodoc-typehints resolves pd.DataFrame to the internal path
# pandas.core.frame.DataFrame, which is not in pandas' intersphinx
# inventory (only pandas.DataFrame is).
# <https://github.com/tox-dev/sphinx-autodoc-typehints/issues/47#issuecomment-401403609>
("py:class", "pandas.core.frame.DataFrame"),
]

# -- Cross-project references ------------------------------------------------
Expand Down