Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-48975: Add show_dataset_types option to queryCollections #1157

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/changes/DM-48975.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Add a new `--show-dataset-types` argument (`-t`) to `butler query-collections`
to list the dataset types in each collection.
Also add a new `--exclude-dataset-types` which allows a comma-separated list
of string globs to be passed in for exclusion when dataset types are shown.
17 changes: 16 additions & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,21 @@ def prune_datasets(**kwargs: Any) -> None:
case_sensitive=False,
),
)
@click.option(
"-t",
"--show-dataset-types",
is_flag=True,
help="Also show the dataset types registered within each collection.",
)
@click.option(
"--exclude-dataset-types",
type=click.STRING,
multiple=True,
default=["*_config,*_log,*_metadata,packages"],
callback=split_commas,
show_default=True,
help="Dataset types (comma-separated) to exclude. Only valid with --show-dataset-types.",
)
@options_file_option()
def query_collections(*args: Any, **kwargs: Any) -> None:
"""Get the collections whose names match an expression."""
Expand Down Expand Up @@ -454,7 +469,7 @@ def query_dataset_types(*args: Any, **kwargs: Any) -> None:
"""Get the dataset types in a repository."""
table = script.queryDatasetTypes(*args, **kwargs)
if table:
table.pprint_all()
table.pprint_all(align="<")
else:
print("No results. Try --help for more information.")

Expand Down
182 changes: 153 additions & 29 deletions python/lsst/daf/butler/script/queryCollections.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
from __future__ import annotations

from collections.abc import Iterable
from fnmatch import fnmatch
from typing import Literal

from astropy.table import Table
from astropy.table import Column, Table, hstack, vstack

from .._butler import Butler
from .._butler_collections import CollectionInfo
Expand All @@ -42,6 +43,8 @@
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Run queryCollections and return the results in Table form.

Expand All @@ -60,6 +63,11 @@
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If `True`, also show the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
An iterable of glob-style patterns matching dataset types to exclude.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not comma-separated?

Only has an effect if `show_dataset_types` is `True`.

Returns
-------
Expand All @@ -72,37 +80,65 @@
names=("Name", typeCol, descriptionCol),
dtype=(str, str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))

Check warning on line 84 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L84

Added line #L84 was not covered by tests
butler = Butler.from_config(repo)

def addCollection(info: CollectionInfo, relative: str) -> None:
info_relatives = getattr(info, relative)
# Parent results can be returned in a non-deterministic order, so sort
# them to make the output deterministic.
if relative == "parents":
info_relatives = sorted(info_relatives)
if info_relatives:
collection_table = Table([[info.name], [info.type.name]], names=("Name", typeCol))
description_table = Table(names=(descriptionCol,), dtype=(str,))
for info_relative in info_relatives:
relative_table = Table([[info_relative]], names=(descriptionCol,))
if show_dataset_types:
cinfo = butler.collections.get_info(info_relative, include_summary=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this code is completely untested.

dataset_types = [""] if not cinfo.dataset_types else cinfo.dataset_types

Check warning on line 100 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L99-L100

Added lines #L99 - L100 were not covered by tests
if exclude_dataset_types:
dataset_types = [

Check warning on line 102 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L102

Added line #L102 was not covered by tests
dt
for dt in dataset_types
if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
]
dataset_types = [""] if not dataset_types else dataset_types
types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
relative_table = hstack([relative_table, types_table]).filled("")

Check warning on line 109 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L107-L109

Added lines #L107 - L109 were not covered by tests
description_table = vstack([description_table, relative_table])
collection_table = hstack([collection_table, description_table]).filled("")
for row in collection_table:
table.add_row(row)
else:
new_row = [info.name, info.type.name]
new_row.extend([""] * (len(table.colnames) - len(new_row)))
table.add_row(new_row)

collections = sorted(
butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
)
if inverse:
for info in collections:
if info.parents:
first = True
for parentName in sorted(info.parents):
table.add_row((info.name if first else "", info.type.name if first else "", parentName))
first = False
else:
table.add_row((info.name, info.type.name, ""))
addCollection(info, "parents")
# If none of the datasets has a parent dataset then remove the
# description column.
if not any(c for c in table[descriptionCol]):
del table[descriptionCol]
else:
for info in collections:
if info.type == CollectionType.CHAINED:
if info.children:
first = True
for child in info.children:
table.add_row((info.name if first else "", info.type.name if first else "", child))
first = False
else:
table.add_row((info.name, info.type.name, ""))
addCollection(info, "children")
else:
table.add_row((info.name, info.type.name, ""))
new_row = [info.name, info.type.name]
new_row.extend([""] * (len(table.colnames) - len(new_row)))
table.add_row(new_row)
# If there aren't any CHAINED datasets in the results then remove the
# description column.
if not any(columnVal == CollectionType.CHAINED.name for columnVal in table[typeCol]):
Expand All @@ -116,6 +152,8 @@
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Run queryCollections and return the results in a table representing tree
form.
Expand All @@ -134,6 +172,11 @@
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If `True`, also show the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
An iterable of glob-style patterns matching dataset types to exclude.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By this point it isn't comma-separated any more.

Only has an effect if `show_dataset_types` is `True`.

Returns
-------
Expand All @@ -144,51 +187,127 @@
names=("Name", "Type"),
dtype=(str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo, without_datastore=True)

def addCollection(info: CollectionInfo, level: int = 0) -> None:
table.add_row((" " * level + info.name, info.type.name))
collection_table = Table([[" " * level + info.name], [info.type.name]], names=["Name", "Type"])
if show_dataset_types:
dataset_types = [""] if not info.dataset_types else info.dataset_types
if exclude_dataset_types:
dataset_types = [
dt
for dt in dataset_types
if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
]
dataset_types = [""] if not dataset_types else dataset_types
dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
collection_table = hstack([collection_table, dataset_types_table]).filled("")
for row in collection_table:
table.add_row(row)

if inverse:
assert info.parents is not None # For mypy.
for pname in sorted(info.parents):
pinfo = butler.collections.get_info(pname, include_parents=inverse)
pinfo = butler.collections.get_info(
pname, include_parents=inverse, include_summary=show_dataset_types
)
addCollection(pinfo, level + 1)
else:
if info.type == CollectionType.CHAINED:
for name in info.children:
cinfo = butler.collections.get_info(name)
cinfo = butler.collections.get_info(name, include_summary=show_dataset_types)
addCollection(cinfo, level + 1)

collections = butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
for collection in sorted(collections):
addCollection(collection)
return table


def _getList(
repo: str, glob: Iterable[str], collection_type: Iterable[CollectionType], flatten_chains: bool
repo: str,
glob: Iterable[str],
collection_type: Iterable[CollectionType],
flatten_chains: bool,
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Return collection results as a table representing a flat list of
collections.

Parameters
----------
repo : `str`
Butler repository location.
glob : `collections.abc.Iterable` of `str`
Wildcards to pass to ``queryCollections``.
collection_type
Same as `queryCollections`
flatten_chains : `bool`
If `True`, flatten the tree of CHAINED datasets.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
If True, flatten the tree of CHAINED datasets.
If `True`, flatten the tree of CHAINED datasets.

and elsewhere.

show_dataset_types : `bool`
If `True`, also show the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
An iterable of glob-style patterns matching dataset types to exclude.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not comma-separated?

Only has an effect if `show_dataset_types` is `True`.

Returns
-------
collections : `astropy.table.Table`
Same as `queryCollections`
"""
table = Table(
names=("Name", "Type"),
dtype=(str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))

Check warning on line 271 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L271

Added line #L271 was not covered by tests
butler = Butler.from_config(repo)

def addCollection(info: CollectionInfo) -> None:
collection_table = Table([[info.name], [info.type.name]], names=["Name", "Type"])
if show_dataset_types:
dataset_types = [""] if not info.dataset_types else info.dataset_types

Check warning on line 277 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L277

Added line #L277 was not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this identical code is used in three places (the filtering of dataset_types) please put it in a helper function.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch also untested.

if exclude_dataset_types:
dataset_types = [

Check warning on line 279 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L279

Added line #L279 was not covered by tests
dt
for dt in dataset_types
if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
]
dataset_types = [""] if not dataset_types else dataset_types
dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
collection_table = hstack([collection_table, dataset_types_table]).filled("")

Check warning on line 286 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L284-L286

Added lines #L284 - L286 were not covered by tests
for row in collection_table:
table.add_row(row)

collections = list(
butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), flatten_chains=flatten_chains
glob or "*",
collection_types=frozenset(collection_type),
flatten_chains=flatten_chains,
include_summary=show_dataset_types,
)
)
names = [c.name for c in collections]
types = [c.type.name for c in collections]
return Table((names, types), names=("Name", "Type"))
for collection in collections:
addCollection(collection)

return table


def queryCollections(
repo: str,
glob: Iterable[str],
collection_type: Iterable[CollectionType],
chains: Literal["INVERSE-TABLE", "TABLE", "TREE", "INVERSE-TREE", "FLATTEN", "NO-CHILDREN"],
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Get the collections whose names match an expression.

Expand All @@ -206,17 +325,22 @@
chains : `str`
Affects contents and formatting of results, see
``cli.commands.query_collections``.
show_dataset_types : `bool`, optional
If `True`, include the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ], optional
An iterable of glob-style patterns matching dataset types to exclude.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comma-separated?

Only has an effect if `show_dataset_types` is `True`.

Returns
-------
collections : `astropy.table.Table`
A table containing information about collections.
"""
if (inverse := chains == "INVERSE-TABLE") or chains == "TABLE":
return _getTable(repo, glob, collection_type, inverse)
return _getTable(repo, glob, collection_type, inverse, show_dataset_types, exclude_dataset_types)
elif (inverse := chains == "INVERSE-TREE") or chains == "TREE":
return _getTree(repo, glob, collection_type, inverse)
return _getTree(repo, glob, collection_type, inverse, show_dataset_types, exclude_dataset_types)
elif chains == "FLATTEN" or chains == "NO-CHILDREN":
flatten = chains == "FLATTEN"
return _getList(repo, glob, collection_type, flatten)
return _getList(repo, glob, collection_type, flatten, show_dataset_types, exclude_dataset_types)
raise RuntimeError(f"Value for --chains not recognized: {chains}")
Loading
Loading