diff --git a/dandi/cli/cmd_search.py b/dandi/cli/cmd_search.py
new file mode 100644
index 000000000..b3c4e6abc
--- /dev/null
+++ b/dandi/cli/cmd_search.py
@@ -0,0 +1,212 @@
+from pathlib import Path
+
+from SPARQLWrapper import JSON, SPARQLWrapper
+import click
+import pandas as pd
+
+from .base import map_to_click_exceptions
+
+# supported fields
+# TODO: adding more
+DANDISETS_FIELDS = {
+    "approach": ["apr", "?as dandi:approach / schema:name ?apr ."],
+    "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."],
+    "species_name": ["snm", "?as dandi:species / schema:name ?snm ."],
+}
+
+ASSETS_FIELDS = {
+    "size": ["size", "?asset schema:contentSize ?size ."],
+    "format": ["format", "?asset schema:encodingFormat ?format ."],
+    "age": ["age", "?asset prov:wasAttributedTo / dandi:age / schema:value ?age ."],
+}
+
+
+def parse_validate(ctx, param, value):
+    value_parse = []
+    # parsing elements that have multiple comma-separated values
+    for el in value:
+        value_parse += el.split(",")
+    if param.name == "select_fields":
+        if ctx.params.get("search_type") == "dandisets":
+            choice_list = DANDISETS_FIELDS.keys()
+        elif ctx.params.get("search_type") == "assets":
+            choice_list = ASSETS_FIELDS.keys()
+        else:
+            choice_list = None
+    else:
+        choice_list = None
+    # checking if all values are in the list of possible choices
+    for el in value_parse:
+        if choice_list and el not in choice_list:
+            ctx.fail(f"{el} is not in the list: {choice_list}")
+    return value_parse
+
+
+@click.command(help="Search TODO")
+@click.option(
+    "-F",
+    "--file",
+    help="A file that contains a complete SPARQL query to be used for the search",
+)
+@click.option(
+    "-t",
+    "--search_type",
+    help="Type of the search.",
+    type=click.Choice(["dandisets", "assets"]),
+)
+@click.option(
+    "-s",
+    "--select_fields",
+    help="Field name for dandisets search",
+    callback=parse_validate,
+    multiple=True,
+)
+@click.option(
+    "-f",
+    "--filter_fields",
+    help="Field name for dandisets search",
+    type=(str, str),
+    multiple=True,
+)
+@click.option(
+    "--format",
+    help="Choose the format for output. TODO",
+    type=click.Choice(["stdout", "csv"]),
+    default="stdout",
+)
+@click.option(
+    "--number_of_lines",
+    help="Number of lines of output that will be printed",
+    default=10,
+)
+@click.option(
+    "-d",
+    "--database_name",
+    help="Database name",
+    default="dandisets_new",
+)
+@map_to_click_exceptions
+def search(
+    file=None,
+    search_type=None,
+    select_fields=None,
+    filter_fields=None,
+    format="stdout",
+    number_of_lines=10,
+    database_name="dandisets_new",
+):
+
+    if file and search_type:
+        raise Exception("file and type are mutually exclusive options")
+
+    if file:
+        filepath = Path(file)
+        with filepath.open() as f:
+            query_str = f.read()
+    elif search_type in ["dandisets", "assets"]:
+        if not select_fields:
+            raise Exception(
+                f"select_fields is required if search type is {search_type}"
+            )
+        if filter_fields:
+            for el in filter_fields:
+                if el[0] not in select_fields:
+                    raise Exception(
+                        f"field {el[0]} used in filter_fields, "
+                        f"but select fields contain {select_fields}"
+                    )
+        if search_type == "dandisets":
+            query_str = create_dandisets_query(select_fields, filter_fields)
+        elif search_type == "assets":
+            query_str = create_assets_query(select_fields, filter_fields)
+    else:
+        raise NotImplementedError
+
+    endpoint = f"https://search.dandiarchive.org:5820/{database_name}/query"
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setCredentials("anonymous", "anonymous")
+    sparql.setReturnFormat(JSON)
+    sparql.setQuery(query_str)
+
+    results = sparql.queryAndConvert()
+    res_df = results2df(results, number_of_lines)
+
+    if format == "stdout":
+        print(res_df)
+    else:
+        raise NotImplementedError("only stdout format implemented for now")
+
+
+def results2df(results, limit=10):
+    res_lim = results["results"]["bindings"][:limit]
+    res_val_l = [dict((k, v["value"]) for k, v in res.items()) for res in res_lim]
+    return pd.DataFrame(res_val_l)
+
+
+def filter_query(filter_fields, fields_dict):
+    """creating filter part for the queries"""
+    filter_str = ""
+    for (key, val) in filter_fields:
+        if val[0] == "(" and val[-1] == ")":
+            val = val[1:-1].split(",")
+            if len(val) != 2:
+                raise ValueError(
+                    "If value for filter is a tuple, it has to have 2 elements "
+                )
+            else:
+                min_val = val[0].strip()
+                max_val = val[1].strip()
+                if max_val and min_val:
+                    filter_str += (
+                        f"FILTER (?{fields_dict[key][0]} > {min_val} "
+                        f"&& ?{fields_dict[key][0]} < {max_val}) \n"
+                    )
+                elif max_val:
+                    filter_str += f"FILTER (?{fields_dict[key][0]} < {max_val}) \n"
+                elif min_val:
+                    filter_str += f"FILTER (?{fields_dict[key][0]} > {min_val}) \n"
+        else:
+            val = val.split(",")
+            cond_str = f'?{fields_dict[key][0]} = "{val[0]}"'
+            for el in val[1:]:
+                cond_str += f' || ?{fields_dict[key][0]} = "{el}"'
+            filter_str += f"FILTER ({cond_str}) \n"
+    return filter_str
+
+
+def create_dandisets_query(select_fields, filter_fields):
+    """Creating a query for dandisets search"""
+    var = ""
+    for el in select_fields:
+        var += f" ?{DANDISETS_FIELDS[el][0]}"
+
+    query_str = (
+        f"SELECT DISTINCT ?d{var} WHERE \n" "{ \n" " ?d dandi:assetsSummary ?as . \n"
+    )
+    for el in select_fields:
+        query_str += f" {DANDISETS_FIELDS[el][1]} \n"
+    query_str += filter_query(filter_fields, DANDISETS_FIELDS)
+    query_str += "}"
+    return query_str
+
+
+def create_assets_query(select_fields, filter_fields):
+    """Creating a query for assets search"""
+    var = ""
+    for el in select_fields:
+        var += f" ?{ASSETS_FIELDS[el][0]}"
+
+    query_str = (
+        f"SELECT DISTINCT ?asset ?d_id ?path{var} WHERE \n"
+        "{ \n"
+        " ?asset rdf:type dandi:Asset . \n"
+        " ?d prov:hasMember ?asset . \n"
+        " ?d schema:identifier ?d_id . \n"
+        " ?asset dandi:path ?path . \n"
+    )
+    for el in select_fields:
+        query_str += f" {ASSETS_FIELDS[el][1]} \n"
+    query_str += filter_query(filter_fields, ASSETS_FIELDS)
+    query_str += "}"
+    return query_str
diff --git a/dandi/cli/command.py b/dandi/cli/command.py
index a3e0770b6..cd95eb87e 100644
--- a/dandi/cli/command.py
+++ b/dandi/cli/command.py
@@ -145,6 +145,7 @@ def main(ctx, log_level, pdb=False):
 from .cmd_ls import ls  # noqa: E402
 from .cmd_move import move  # noqa: E402
 from .cmd_organize import organize  # noqa: E402
+from .cmd_search import search  # noqa: E402
 from .cmd_service_scripts import service_scripts  # noqa: E402
 from .cmd_shell_completion import shell_completion  # noqa: E402
 from .cmd_upload import upload  # noqa: E402
@@ -158,6 +159,7 @@ def main(ctx, log_level, pdb=False):
     ls,
     move,
     organize,
+    search,
     service_scripts,
     shell_completion,
     upload,
diff --git a/dandi/cli/tests/test_search.py b/dandi/cli/tests/test_search.py
new file mode 100644
index 000000000..bafc4847f
--- /dev/null
+++ b/dandi/cli/tests/test_search.py
@@ -0,0 +1,192 @@
+from click.testing import CliRunner
+import pytest
+
+from dandi.tests.skip import mark
+
+from ..command import search
+
+pytestmark = mark.skipif_no_network
+
+
+@pytest.mark.parametrize(
+    "select_fields, print_fields",
+    [
+        ("approach", ["apr"]),
+        ("species_name", ["snm"]),
+    ],
+)
+def test_search_dandiset_select_fields(select_fields, print_fields):
+    """using select_fields option with single or multiple comma-separated values"""
+    runner = CliRunner()
+    r = runner.invoke(search, ["-t", "dandisets", "--select_fields", select_fields])
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for fld in print_fields:
+        assert fld in out, f"{fld} is not in the output: {out}"
+
+
+@pytest.mark.parametrize(
+    "select_fields_mult",
+    [
+        (["--select_fields", "approach", "-s", "species_name"]),
+        (["--select_fields", "approach,species_name"]),
+    ],
+)
+def test_search_dandiset_select_fields_mult(select_fields_mult):
+    """using select_fields option multiple times"""
+    runner = CliRunner()
+    r = runner.invoke(search, ["-t", "dandisets"] + select_fields_mult)
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for fld in ["apr", "snm"]:
+        assert fld in out, f"field {fld} is not in the output"
+
+
+@pytest.mark.parametrize(
+    "filter_fields", [["species_name", "Human"], ["approach", "behavioral approach"]]
+)
+def test_search_dandiset_check_filter(filter_fields):
+    """using a single filter_fields option in dandisets search"""
+    runner = CliRunner()
+    r = runner.invoke(
+        search,
+        [
+            "-t",
+            "dandisets",
+            "--select_fields",
+            "approach,species_name",
+            "--filter_fields",
+        ]
+        + filter_fields,
+    )
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for ln in out.split("\n")[1:]:
+        if ln == "":
+            break
+        assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output"
+
+
+def test_search_dandiset_check_filter_mult():
+    """using a filter option multiple times"""
+    runner = CliRunner()
+    r = runner.invoke(
+        search,
+        [
+            "-t",
+            "dandisets",
+            "--select_fields",
+            "approach,species_name",
+            "-f",
+            "species_name",
+            "Human",
+            "-f",
+            "approach",
+            "behavioral approach",
+        ],
+    )
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for ln in out.split("\n")[1:-1]:
+        assert "Human" in ln, "Human is not in the output"
+        assert "behavioral approach" in ln, "behavioral approach is not in the output"
+
+
+def test_search_dandiset_check_filter_list():
+    """using comma-separated list in a filter option"""
+    runner = CliRunner()
+    r = runner.invoke(
+        search,
+        [
+            "-t",
+            "dandisets",
+            "--select_fields",
+            "approach,species_name",
+            "-f",
+            "species_name",
+            "Human,House mouse",
+        ],
+    )
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for ln in out.split("\n")[1:-1]:
+        assert "Human" in ln or "House mouse" in ln, "Human is not in the output"
+
+
+@pytest.mark.parametrize(
+    "select_fields, print_fields",
+    [("format", ["format"]), ("size", ["size"]), ("size,format", ["size", "format"])],
+)
+def test_search_assets_select_fields(select_fields, print_fields):
+    """using select_fields option with single or multiple comma-separated values"""
+    runner = CliRunner()
+    r = runner.invoke(search, ["-t", "assets", "--select_fields", select_fields])
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for fld in print_fields:
+        assert fld in out, f"{fld} is not in the output: {out}"
+
+
+@pytest.mark.parametrize(
+    "filter_fields",
+    [
+        ["format", "application/x-nwb"],
+    ],
+)
+def test_search_asset_check_filter(filter_fields):
+    """using select_fields and filter option in assets search"""
+    runner = CliRunner()
+    r = runner.invoke(
+        search,
+        ["-t", "assets", "--select_fields", "format,size", "--filter_fields"]
+        + filter_fields,
+    )
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    for ln in out.split("\n")[1:]:
+        if ln == "":
+            break
+        assert filter_fields[1] in ln, f"value {filter_fields[1]} is not in the output"
+
+
+@pytest.mark.parametrize(
+    "size_range", [(7 * 10e9, 9 * 10e9), "(, 9*10e9)", "(7*10e9, )"]
+)
+def test_search_asset_check_filter_range(size_range):
+    """using range in a filter option for assets search"""
+    runner = CliRunner()
+    r = runner.invoke(
+        search,
+        [
+            "-t",
+            "assets",
+            "--select_fields",
+            "format,size",
+            "--filter_fields",
+            "size",
+            f"{size_range}",
+        ],
+    )
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    assert "size" in out.split("\n")[0]
+    assert len(out.split("\n")) > 1
+
+
+def test_search_from_file(tmpdir):
+    """using search command with a file that contains any SPARQL query"""
+    query_file = tmpdir / "query.txt"
+    query = """
+    SELECT DISTINCT ?apr WHERE
+    {
+        ?as dandi:approach / schema:name ?apr
+    }
+    """
+    with query_file.open("w") as f:
+        f.write(query)
+    runner = CliRunner()
+    r = runner.invoke(search, ["-F", query_file])
+    assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
+    out = r.stdout
+    assert "apr" in out.split("\n")[0]
+    assert len(out.split("\n")) > 1
diff --git a/dandi/support/pyout.py b/dandi/support/pyout.py
index 90b422428..ebc51d3a1 100644
--- a/dandi/support/pyout.py
+++ b/dandi/support/pyout.py
@@ -36,7 +36,7 @@ def naturalsize(v):
     return humanize.naturalsize(v)
 
 
-def datefmt(v, fmt=u"%Y-%m-%d/%H:%M:%S"):
+def datefmt(v, fmt="%Y-%m-%d/%H:%M:%S"):
     if isinstance(v, datetime.datetime):
         return v.strftime(fmt)
     else:
@@ -71,8 +71,8 @@ def counts(values):
     color=dict(
         interval=[
             [0, 1024, "blue"],
-            [1024, 1024 ** 2, "green"],
-            [1024 ** 2, None, "red"],
+            [1024, 1024**2, "green"],
+            [1024**2, None, "red"],
         ]
     ),
     aggregate=lambda x: naturalsize(sum(x)),
diff --git a/setup.cfg b/setup.cfg
index 0de477174..b26ce0254 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,6 +57,7 @@ install_requires =
     requests ~= 2.20
     ruamel.yaml >=0.15, <1
     semantic-version
+    sparqlwrapper
     tenacity
     zarr ~= 2.10
 zip_safe = False