Skip to content

Commit 0415fff

Browse files
authored
Merge pull request #339 from clamsproject/326-describe-cli
basic `mmif describe` command
2 parents 9c0d3ba + f7eedfa commit 0415fff

5 files changed

Lines changed: 348 additions & 19 deletions

File tree

mmif/utils/cli/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from mmif.utils.cli import describe
12
from mmif.utils.cli import rewind
23
from mmif.utils.cli import source
34

mmif/utils/cli/describe.py

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
import argparse
2+
import hashlib
3+
import json
4+
import sys
5+
import textwrap
6+
from pathlib import Path
7+
from typing import Union, List, Tuple, Optional
8+
9+
from mmif import Mmif
10+
11+
12+
def split_appname_appversion(
        long_app_id: str
) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract the app name and app version from a long app identifier.

    The identifier is expected to look like
    "uri://APP_DOMAIN/APP_NAME/APP_VERSION". It is split with
    ``pathlib.Path``, so the third path component is taken as the name
    and the fourth as the version.

    :param long_app_id: Full app identifier URI
    :return: Tuple of (app_name, app_version); an element is None when
             the identifier has no such component, and the version is
             also None when it is the literal string 'unresolvable'
    """
    components = Path(long_app_id).parts

    # Too short to carry even a name component.
    if len(components) <= 2:
        return None, None

    name = components[2]
    # A name without a version component.
    if len(components) == 3:
        return name, None

    version = components[3]
    if name.endswith(version):
        # The name repeats the version as a suffix: drop the suffix
        # together with the single character that precedes it.
        name = name[:-(len(version) + 1)]
    if version == 'unresolvable':
        version = None
    return name, version
32+
33+
34+
def generate_param_hash(params: dict) -> str:
    """
    Generate an MD5 hash from a parameter dictionary.

    Each item is rendered as ``key=value``, the rendered pairs are sorted
    alphabetically, joined with commas, and hashed with MD5. The hash is
    not used for security purposes, only to derive a consistent
    identifier from a set of parameters.

    Keys and values are both converted with ``str()``, so non-string
    keys are accepted as well.

    :param params: Dictionary of parameters; an empty or None value is
                   hashed as the empty string
    :return: MD5 hash string (32 hex characters)
    """
    if not params:
        param_string = ""
    else:
        # Sort the rendered pairs so the hash does not depend on the
        # insertion order of the dictionary.
        pairs = sorted(f"{key!s}={value!s}" for key, value in params.items())
        param_string = ','.join(pairs)
    return hashlib.md5(param_string.encode('utf-8')).hexdigest()
52+
53+
54+
def _profiling_value(metadata, field, legacy_key):
    """
    Look up one profiling value in a view's metadata.

    The new layout (``appProfiling[field]``) takes precedence: when an
    "appProfiling" entry exists, the legacy key is not consulted at all,
    even if the entry lacks the requested field.

    :param metadata: View metadata supporting ``in`` and item access
    :param field: Field name inside the "appProfiling" mapping
    :param legacy_key: Old top-level metadata key for the same value
    :return: The stored value, or None when it cannot be found
    """
    if "appProfiling" in metadata:
        profiling = metadata["appProfiling"]
        if isinstance(profiling, dict) and field in profiling:
            return profiling[field]
        return None
    if legacy_key in metadata:
        return metadata[legacy_key]
    return None


def get_pipeline_specs(
        mmif_file: Union[str, Path]
) -> Tuple[
    List[Tuple[str, Optional[str], dict, Optional[str], Optional[dict], int, dict]],
    List[str], List[str], List[str]
]:
    """
    Read a MMIF file and extract the pipeline specification from it.

    Every view is sorted into exactly one bucket, checked in this order:
    views with an error, views with warnings, views without annotations,
    and contentful views. Only contentful views contribute an entry to
    the specification list.

    :param mmif_file: Path to the MMIF file
    :return: Tuple of (spec_list, error_views, warning_views, empty_views).
             Each spec_list entry is a tuple of (view_id, app, configs,
             running_time, running_hardware, annotation_count,
             annotations_by_type), where ``app`` and ``configs`` are the
             view's "app" and "appConfiguration" metadata values,
             ``running_time`` and ``running_hardware`` are the profiling
             values as stored in the metadata (None when absent; the
             running time is noted by the original author as an
             "H:MM:SS.microseconds" string), and ``annotations_by_type``
             maps the string form of each annotation type to its count.
             The three remaining lists hold view IDs.
    :raises ValueError: if ``mmif_file`` is neither a str nor a Path
    """
    if not isinstance(mmif_file, (str, Path)):
        raise ValueError(
            "MMIF file path must be a string or a Path object."
        )

    with open(mmif_file, "r") as handle:
        raw_mmif = handle.read()
    mmif = Mmif(raw_mmif)

    contentful = []
    with_errors = []
    with_warnings = []
    without_annotations = []

    for view in mmif.views:
        # The three skip conditions are mutually exclusive by order.
        if view.has_error():
            with_errors.append(view.id)
            continue
        if view.has_warnings():
            with_warnings.append(view.id)
            continue
        if len(view.annotations) == 0:
            without_annotations.append(view.id)
            continue

        metadata = view.metadata
        app = metadata.get("app")
        configs = metadata.get("appConfiguration", {})

        # Both profiling values support the new (appProfiling.*) and the
        # old (appRunningTime / appRunningHardware) metadata layouts.
        running_time = _profiling_value(
            metadata, "runningTime", "appRunningTime"
        )
        running_hardware = _profiling_value(
            metadata, "hardware", "appRunningHardware"
        )

        # Tally annotations per type, keyed by the type's string form.
        type_counts = {}
        for annotation in view.annotations:
            type_key = str(annotation.at_type)
            if type_key in type_counts:
                type_counts[type_key] += 1
            else:
                type_counts[type_key] = 1

        contentful.append((
            view.id, app, configs, running_time, running_hardware,
            len(view.annotations), type_counts
        ))

    return contentful, with_errors, with_warnings, without_annotations
137+
138+
139+
def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str:
    """
    Build a pipeline identifier string from a MMIF file.

    The identifier mirrors a storage directory layout, with one
    ``app_name/version/param_hash`` segment per view:
    app_name/version/param_hash/app_name2/version2/param_hash2/...

    The hash is computed from ``view.metadata.parameters`` (described by
    the original author as the raw user-passed values) so the identifier
    is reproducible. Views with errors or warnings, and views without an
    "app" metadata value, contribute no segment; views without
    annotations are still included.

    :param mmif_file: Path to the MMIF file
    :return: Pipeline identifier string (empty when no view qualifies)
    :raises ValueError: if ``mmif_file`` is neither a str nor a Path
    """
    if not isinstance(mmif_file, (str, Path)):
        raise ValueError(
            "MMIF file path must be a string or a Path object."
        )

    mmif = Mmif(Path(mmif_file).read_text())

    pieces = []
    for view in mmif.views:
        # Failed or warned views are not part of the pipeline identity.
        if view.has_error() or view.has_warnings():
            continue

        app = view.metadata.get("app")
        if app is None:
            continue
        name, version = split_appname_appversion(app)

        # Hash the raw parameters; a view without them hashes as empty.
        try:
            raw_params = view.metadata.parameters
        except (KeyError, AttributeError):
            raw_params = {}
        digest = generate_param_hash(raw_params)

        # Placeholders keep the three-part segment shape intact.
        pieces.append('/'.join((
            name or "unknown",
            version or "unversioned",
            digest,
        )))

    return '/'.join(pieces)
188+
189+
190+
def describe_argparser():
    """
    Return the help texts for the describe subcommand.

    Two strings are returned: a one-line description, and a longer text
    made of that one-liner followed by additional material. They are
    meant for the top-level CLI help listing and for the subcommand's
    own ``--help`` output, respectively.
    """
    oneliner = (
        'provides CLI to describe the pipeline specification from a MMIF '
        'file.'
    )
    # The additional text starts with a newline, so the leading empty
    # item below is intentional.
    detail_lines = [
        '',
        'MMIF describe extracts pipeline information from a MMIF file and outputs',
        'a JSON summary including:',
        '',
        '- pipeline_id: unique identifier for the pipeline based on apps, versions,',
        'and parameter hashes (excludes error/warning views)',
        '- stats: annotation counts (total and per-view), counts by annotation type,',
        'and lists of error/warning/empty view IDs',
        '- views: map of view IDs to app configurations and profiling data',
        '',
        'Views with errors or warnings are tracked but excluded from the pipeline',
        'identifier and annotation statistics.',
    ]
    additional = '\n'.join(detail_lines)
    return oneliner, oneliner + '\n\n' + additional
213+
214+
215+
def prep_argparser(**kwargs):
    """
    Build the argument parser for the describe subcommand.

    The parser accepts an optional positional ``MMIF_FILE`` (opened for
    reading), an ``-o/--output`` file (opened for writing, STDOUT by
    default) and a ``-p/--pretty`` flag.

    :param kwargs: Extra keyword arguments forwarded to
                   ``argparse.ArgumentParser``
    :return: The configured ``argparse.ArgumentParser``
    """
    _, long_description = describe_argparser()
    parser = argparse.ArgumentParser(
        description=long_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        **kwargs
    )

    # With piped input, STDIN becomes the default source; on an
    # interactive terminal there is no default and the value stays None.
    if sys.stdin.isatty():
        input_default = None
    else:
        input_default = sys.stdin

    parser.add_argument(
        "MMIF_FILE",
        nargs="?",
        type=argparse.FileType("r"),
        default=input_default,
        help='input MMIF file path, or STDIN if `-` or not provided.'
    )
    parser.add_argument(
        "-o", "--output",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help='output file path, or STDOUT if not provided.'
    )
    parser.add_argument(
        "-p", "--pretty",
        action="store_true",
        help="Pretty-print JSON output"
    )
    return parser
240+
241+
242+
def _build_description(pipeline_id, spec, error_views, warning_views,
                       empty_views):
    """
    Assemble the JSON-serializable summary written by ``main``.

    :param pipeline_id: Pipeline identifier string
    :param spec: List of per-view tuples (view_id, app, configs,
                 running_time, running_hardware, annotation_count,
                 annotations_by_type)
    :param error_views: IDs of views with errors
    :param warning_views: IDs of views with warnings
    :param empty_views: IDs of views without annotations
    :return: Dictionary with "pipeline_id", "stats" and "views" keys
    """
    views = {}
    annotation_count_stats = {"total": 0}
    annotation_count_by_type = {}

    for (view_id, app, configs, running_time, running_hardware,
         annotation_count, annotations_by_type) in spec:
        entry = {
            "app": app,
            "appConfiguration": configs,
        }
        # Profiling is always emitted in the new appProfiling layout,
        # and only when at least one value is known.
        profiling = {}
        if running_time is not None:
            profiling["runningTime"] = running_time
        if running_hardware is not None:
            profiling["hardware"] = running_hardware
        if profiling:
            entry["appProfiling"] = profiling
        views[view_id] = entry

        # Overall and per-view annotation counts share one mapping.
        annotation_count_stats["total"] += annotation_count
        annotation_count_stats[view_id] = annotation_count

        # Per-type counts, each with its own total and per-view numbers.
        for at_type, count in annotations_by_type.items():
            per_type = annotation_count_by_type.setdefault(
                at_type, {"total": 0}
            )
            per_type["total"] += count
            per_type[view_id] = count

    return {
        "pipeline_id": pipeline_id,
        "stats": {
            "viewCount": len(views),
            "errorViews": error_views,
            "warningViews": warning_views,
            "emptyViews": empty_views,
            "annotationCount": annotation_count_stats,
            "annotationCountByType": annotation_count_by_type
        },
        "views": views
    }


def main(args):
    """
    Main entry point for the describe CLI command.

    Reads MMIF content from ``args.MMIF_FILE`` and writes a JSON summary
    to ``args.output`` containing:
    - pipeline_id: unique identifier for the pipeline
    - stats: view counts, annotation counts (total/per-view/per-type),
      and lists of error/warning/empty view IDs
    - views: map of view IDs to app configurations and profiling data

    :param args: Parsed command-line arguments, providing ``MMIF_FILE``
                 (readable file object or None), ``output`` (writable
                 file object) and ``pretty`` (bool)
    :raises ValueError: if no MMIF input is available
    """
    import os
    import tempfile

    # The parser leaves MMIF_FILE as None when no path is given and
    # STDIN is an interactive terminal; fail with a clear message
    # instead of an AttributeError on None.
    if args.MMIF_FILE is None:
        raise ValueError(
            "No MMIF input: provide a MMIF file path or pipe MMIF "
            "content through STDIN."
        )
    mmif_content = args.MMIF_FILE.read()

    # The extraction functions only accept a file path, so the content
    # (which may come from STDIN) is spooled to a temporary file.
    # NOTE(review): the temp file is written and re-read with the
    # platform default encoding - confirm that is intended.
    tmp = tempfile.NamedTemporaryFile(
        mode='w', suffix='.mmif', delete=False
    )
    tmp_path = tmp.name
    try:
        # Writing happens inside the try block so the temp file is
        # removed even when the write itself fails.
        with tmp:
            tmp.write(mmif_content)

        spec, error_views, warning_views, empty_views = get_pipeline_specs(
            tmp_path
        )
        pipeline_id = generate_pipeline_identifier(tmp_path)

        output = _build_description(
            pipeline_id, spec, error_views, warning_views, empty_views
        )

        json.dump(output, args.output, indent=2 if args.pretty else None)
        args.output.write('\n')
    finally:
        # Clean up temp file
        os.unlink(tmp_path)
328+
329+
330+
if __name__ == "__main__":
    # Script entry point: parse the command line and run the command.
    cli_parser = prep_argparser()
    main(cli_parser.parse_args())

mmif/utils/cli/rewind.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,14 @@ def describe_argparser():
7676
def prep_argparser(**kwargs):
7777
parser = argparse.ArgumentParser(description=describe_argparser()[1],
7878
formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs)
79-
parser.add_argument("IN_MMIF_FILE",
79+
parser.add_argument("MMIF_FILE",
8080
nargs="?", type=argparse.FileType("r"),
8181
default=None if sys.stdin.isatty() else sys.stdin,
8282
help='input MMIF file path, or STDIN if `-` or not provided.')
83-
parser.add_argument("OUT_MMIF_FILE",
84-
nargs="?", type=argparse.FileType("w"),
83+
parser.add_argument("-o", "--output",
84+
type=argparse.FileType("w"),
8585
default=sys.stdout,
86-
help='output MMIF file path, or STDOUT if `-` or not provided.')
86+
help='output file path, or STDOUT if not provided.')
8787
parser.add_argument("-p", '--pretty', action='store_true',
8888
help="Pretty-print rewound MMIF")
8989
parser.add_argument("-n", '--number', default="0", type=int,
@@ -95,7 +95,7 @@ def prep_argparser(**kwargs):
9595

9696

9797
def main(args):
98-
mmif_obj = mmif.Mmif(args.IN_MMIF_FILE.read())
98+
mmif_obj = mmif.Mmif(args.MMIF_FILE.read())
9999

100100
if args.number == 0: # If user doesn't know how many views to rewind, give them choices.
101101
choice = prompt_user(mmif_obj)
@@ -104,7 +104,7 @@ def main(args):
104104
if not isinstance(choice, int) or choice <= 0:
105105
raise ValueError(f"Only can rewind by a positive number of views. Got {choice}.")
106106

107-
args.OUT_MMIF_FILE.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty))
107+
args.output.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty))
108108

109109

110110
if __name__ == "__main__":

0 commit comments

Comments
 (0)