|
| 1 | +import argparse |
| 2 | +import hashlib |
| 3 | +import json |
| 4 | +import sys |
| 5 | +import textwrap |
| 6 | +from pathlib import Path |
| 7 | +from typing import Union, List, Tuple, Optional |
| 8 | + |
| 9 | +from mmif import Mmif |
| 10 | + |
| 11 | + |
def split_appname_appversion(
    long_app_id: str
) -> Tuple[Optional[str], Optional[str]]:
    """
    Pull the app name and app version out of a long app identifier.

    The identifier is expected to look like
    "uri://APP_DOMAIN/APP_NAME/APP_VERSION"; it is split into path
    components and the third and fourth components are taken as the name
    and the version.

    :param long_app_id: Full app identifier URI
    :return: Tuple of (app_name, app_version); either element is None when
             the identifier has too few components, and the version is
             also None when it is the literal 'unresolvable'
    """
    components = Path(long_app_id).parts
    # Pad with None so short identifiers still unpack into two values.
    name, version = (tuple(components[2:4]) + (None, None))[:2]

    if name is not None and version is not None:
        if name.endswith(version):
            # Drop the version suffix plus the one separator character
            # that precedes it.
            name = name[:-(len(version) + 1)]

    if version == 'unresolvable':
        version = None
    return name, version
| 32 | + |
| 33 | + |
def generate_param_hash(params: dict) -> str:
    """
    Generate MD5 hash from a parameter dictionary.

    Each entry is rendered as ``key=value`` (both sides converted to
    strings), the rendered pairs are sorted, joined with commas, and the
    UTF-8 encoding of the joined string is hashed with MD5. An empty (or
    otherwise falsy) ``params`` hashes the empty string. This is not for
    security purposes, only for generating consistent identifiers.

    :param params: Dictionary of parameters
    :return: MD5 hash string (32 hex characters)
    """
    # Formatting with an f-string stringifies the key as well as the value,
    # so a non-string key no longer raises TypeError from str.join.
    pairs = sorted(
        f"{key}={value}" for key, value in (params or {}).items()
    )
    return hashlib.md5(','.join(pairs).encode('utf-8')).hexdigest()
| 52 | + |
| 53 | + |
def _read_profiling_value(metadata, profiling_key: str, legacy_key: str):
    """
    Look up one profiling value in a view's metadata.

    The new layout (``appProfiling[profiling_key]``) is tried first; when it
    does not provide the value, the old top-level ``legacy_key`` is used.

    :param metadata: View metadata object supporting ``in`` and ``[]``
    :param profiling_key: Key inside the ``appProfiling`` mapping
    :param legacy_key: Old top-level metadata key
    :return: The stored value, or None when neither location has it
    """
    if "appProfiling" in metadata:
        profiling = metadata["appProfiling"]
        if isinstance(profiling, dict) and profiling_key in profiling:
            return profiling[profiling_key]
    # Fall through to the legacy key even when "appProfiling" exists but
    # lacks this particular field.
    if legacy_key in metadata:
        return metadata[legacy_key]
    return None


def get_pipeline_specs(
    mmif_file: Union[str, Path]
) -> Tuple[
    List[Tuple[str, Optional[str], dict, Optional[str], Optional[dict], int, dict]],
    List[str], List[str], List[str]
]:
    """
    Read a MMIF file and extract the pipeline specification from it.

    Extracts app configurations, profiling data, and annotation statistics
    for each contentful view. Views with errors, warnings, or no annotations
    are tracked separately; the checks run in that order, so a view ID ends
    up in at most one of the three lists.

    :param mmif_file: Path to the MMIF file
    :return: Tuple of (spec_list, error_views, warning_views, empty_views)
             where spec_list contains tuples of (view_id, app, configs,
             running_time, running_hardware, annotation_count,
             annotations_by_type) for each contentful view, and the three
             lists contain view IDs for error/warning/empty views
             respectively. ``app`` is the raw ``app`` metadata value and
             ``running_time`` / ``running_hardware`` are passed through
             as stored in the view metadata (None when absent).
    :raises ValueError: if ``mmif_file`` is neither a string nor a Path
    """
    if not isinstance(mmif_file, (str, Path)):
        raise ValueError(
            "MMIF file path must be a string or a Path object."
        )

    with open(mmif_file, "r") as f:
        mmif_str = f.read()

    data = Mmif(mmif_str)
    spec = []
    error_views = []
    warning_views = []
    empty_views = []

    for view in data.views:
        # Track error, warning, and empty views (mutually exclusive)
        if view.has_error():
            error_views.append(view.id)
            continue
        if view.has_warnings():
            warning_views.append(view.id)
            continue
        annotation_count = len(view.annotations)
        if annotation_count == 0:
            empty_views.append(view.id)
            continue

        app = view.metadata.get("app")
        configs = view.metadata.get("appConfiguration", {})

        # Support both new (appProfiling.*) and old (appRunning*) layouts
        running_time = _read_profiling_value(
            view.metadata, "runningTime", "appRunningTime"
        )
        running_hardware = _read_profiling_value(
            view.metadata, "hardware", "appRunningHardware"
        )

        # Count annotations per annotation type
        annotations_by_type = {}
        for annotation in view.annotations:
            at_type = str(annotation.at_type)
            annotations_by_type[at_type] = (
                annotations_by_type.get(at_type, 0) + 1
            )

        spec.append((
            view.id, app, configs, running_time, running_hardware,
            annotation_count, annotations_by_type
        ))

    return spec, error_views, warning_views, empty_views
| 137 | + |
| 138 | + |
def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str:
    """
    Build the pipeline identifier string for a MMIF file.

    Every usable view contributes one "app_name/version/param_hash"
    segment, and the segments are joined with "/" in view order:
    app_name/version/param_hash/app_name2/version2/param_hash2/...

    The hash is computed from view.metadata.parameters (raw user-passed
    values) for reproducibility. Views with errors or warnings, and views
    without an "app" metadata entry, contribute nothing; views without
    annotations are still included.

    :param mmif_file: Path to the MMIF file
    :return: Pipeline identifier string
    :raises ValueError: if ``mmif_file`` is neither a string nor a Path
    """
    if not isinstance(mmif_file, (str, Path)):
        raise ValueError(
            "MMIF file path must be a string or a Path object."
        )

    with open(mmif_file, "r") as handle:
        mmif = Mmif(handle.read())

    parts = []
    for view in mmif.views:
        # Failed or warned views are not part of the identifier.
        if view.has_error():
            continue
        if view.has_warnings():
            continue

        app_uri = view.metadata.get("app")
        if app_uri is None:
            continue
        name, version = split_appname_appversion(app_uri)

        # Raw parameters; fall back to an empty mapping when unavailable.
        try:
            raw_params = view.metadata.parameters
        except (KeyError, AttributeError):
            raw_params = {}
        digest = generate_param_hash(raw_params)

        # Placeholders stand in for a missing name or version.
        parts.append(
            f"{name or 'unknown'}/{version or 'unversioned'}/{digest}"
        )

    return '/'.join(parts)
| 188 | + |
| 189 | + |
def describe_argparser():
    """
    Build the help texts for this subcommand.

    Returns a pair of strings: a one-line description, and a longer text
    made of that one-liner followed by additional material. They are meant
    for `clams --help` and `clams <subcmd> --help`, respectively.
    """
    summary = 'provides CLI to describe the pipeline specification from a MMIF file.'
    details = textwrap.dedent("""
    MMIF describe extracts pipeline information from a MMIF file and outputs
    a JSON summary including:

    - pipeline_id: unique identifier for the pipeline based on apps, versions,
      and parameter hashes (excludes error/warning views)
    - stats: annotation counts (total and per-view), counts by annotation type,
      and lists of error/warning/empty view IDs
    - views: map of view IDs to app configurations and profiling data

    Views with errors or warnings are tracked but excluded from the pipeline
    identifier and annotation statistics.""")
    return summary, f"{summary}\n\n{details}"
| 213 | + |
| 214 | + |
def prep_argparser(**kwargs):
    """
    Create the argument parser for the describe command.

    The parser takes an optional positional ``MMIF_FILE`` (opened for
    reading), an ``-o/--output`` file (opened for writing, STDOUT by
    default) and a ``-p/--pretty`` flag.

    :param kwargs: Extra keyword arguments forwarded to
                   ``argparse.ArgumentParser``
    :return: The configured ``argparse.ArgumentParser``
    """
    _, long_description = describe_argparser()
    cli = argparse.ArgumentParser(
        description=long_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        **kwargs
    )

    # STDIN is only used as the default input when it is not an interactive
    # terminal; otherwise the default is None.
    if sys.stdin.isatty():
        input_default = None
    else:
        input_default = sys.stdin

    cli.add_argument(
        "MMIF_FILE", nargs="?", type=argparse.FileType("r"),
        default=input_default,
        help='input MMIF file path, or STDIN if `-` or not provided.'
    )
    cli.add_argument(
        "-o", "--output", type=argparse.FileType("w"), default=sys.stdout,
        help='output file path, or STDOUT if not provided.'
    )
    cli.add_argument(
        "-p", "--pretty", action="store_true",
        help="Pretty-print JSON output"
    )
    return cli
| 240 | + |
| 241 | + |
def main(args):
    """
    Main entry point for the describe CLI command.

    Reads MMIF text from ``args.MMIF_FILE`` and writes a JSON summary,
    followed by a newline, to ``args.output``. The summary contains:
    - pipeline_id: unique identifier for the pipeline
    - stats: ``viewCount`` (number of contentful views), lists of
      error/warning/empty view IDs, ``annotationCount`` (a "total" entry
      plus one entry per view ID) and ``annotationCountByType`` (per
      annotation type, a "total" entry plus one entry per view ID)
    - views: map of view IDs to app configurations and profiling data

    :param args: Parsed command-line arguments; the ``MMIF_FILE``,
                 ``output`` and ``pretty`` attributes are read
    :raises SystemExit: if no MMIF input is available (``MMIF_FILE`` is
                        None)
    """
    import os
    import tempfile

    # MMIF_FILE is None when no path was given and STDIN is a terminal;
    # fail with a readable message instead of an AttributeError.
    if args.MMIF_FILE is None:
        raise SystemExit(
            "No MMIF input: provide a MMIF file path or pipe MMIF into STDIN."
        )

    # Read MMIF content
    mmif_content = args.MMIF_FILE.read()

    # The extraction helpers take a file path, so the content (which may
    # have come from STDIN) is spooled into a temporary file first.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.mmif', delete=False
        ) as tmp:
            # Record the name before writing so a failed write is still
            # cleaned up in the finally clause.
            tmp_path = tmp.name
            tmp.write(mmif_content)

        spec, error_views, warning_views, empty_views = get_pipeline_specs(
            tmp_path
        )
        pipeline_id = generate_pipeline_identifier(tmp_path)

        # Convert to JSON-serializable format and calculate stats
        views = {}
        annotation_count_stats = {"total": 0}
        annotation_count_by_type = {}

        for (view_id, app, configs, running_time, running_hardware,
             annotation_count, annotations_by_type) in spec:
            entry = {
                "app": app,
                "appConfiguration": configs,
            }
            # Output in new appProfiling format; the key is omitted when
            # neither profiling value is known.
            profiling = {}
            if running_time is not None:
                profiling["runningTime"] = running_time
            if running_hardware is not None:
                profiling["hardware"] = running_hardware
            if profiling:
                entry["appProfiling"] = profiling

            views[view_id] = entry

            # Build annotation count stats
            annotation_count_stats["total"] += annotation_count
            annotation_count_stats[view_id] = annotation_count

            # Build annotation count by type stats
            for at_type, count in annotations_by_type.items():
                type_stats = annotation_count_by_type.setdefault(
                    at_type, {"total": 0}
                )
                type_stats["total"] += count
                type_stats[view_id] = count

        output = {
            "pipeline_id": pipeline_id,
            "stats": {
                "viewCount": len(views),
                "errorViews": error_views,
                "warningViews": warning_views,
                "emptyViews": empty_views,
                "annotationCount": annotation_count_stats,
                "annotationCountByType": annotation_count_by_type
            },
            "views": views
        }

        # Write output
        json.dump(output, args.output, indent=2 if args.pretty else None)
        args.output.write('\n')
    finally:
        # Clean up temp file (only if it was actually created)
        if tmp_path is not None:
            os.unlink(tmp_path)
| 328 | + |
| 329 | + |
if __name__ == "__main__":
    # Script entry point: parse the command line and run the describe command.
    main(prep_argparser().parse_args())
0 commit comments