Skip to content

Commit 3044119

Browse files
committed
refined mmif describers for better data presentation
1 parent d9c898c commit 3044119

2 files changed

Lines changed: 104 additions & 71 deletions

File tree

mmif/utils/cli/describe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
from mmif.utils.workflow_helper import generate_workflow_identifier, describe_single_mmif, \
99
describe_mmif_collection
10+
# generate_param_hash is imported for backward compatibility
11+
from mmif.utils.workflow_helper import generate_param_hash
1012

1113

1214
def get_pipeline_specs(mmif_file: Union[str, Path]):

mmif/utils/workflow_helper.py

Lines changed: 102 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def _get_profile_data(view) -> dict:
156156
# need to convert to milliseconds integer
157157
time_obj = datetime.datetime.strptime(running_time_str, "%H:%M:%S.%f").time()
158158
milliseconds = (time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second) * 1000 + time_obj.microsecond // 1000
159-
return {"runningTime": milliseconds}
159+
return {"runningTimeMS": milliseconds}
160160

161161

162162
def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
@@ -189,10 +189,9 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
189189
* ``errorViews``: A list of view IDs that reported errors.
190190
* ``warningViews``: A list of view IDs that reported warnings.
191191
* ``emptyViews``: A list of view IDs that contain no annotations.
192-
* ``annotationCount``: A dictionary with the ``total`` number of
193-
annotations across all app executions.
194192
* ``annotationCountByType``: A dictionary mapping each annotation
195-
type to its ``total`` count across all app executions.
193+
type to its count, plus a ``total`` key for the sum of all
194+
annotations across all app executions.
196195
* ``apps``: A list of objects, where each object represents one app
197196
execution. It includes metadata, profiling, and aggregated statistics
198197
for all views generated by that execution. A special entry for views
@@ -247,10 +246,8 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
247246
}
248247
total_annotations_in_exec = sum(execution_ann_counter.values())
249248
if total_annotations_in_exec > 0:
250-
app_data['annotationCount'] = {'total': total_annotations_in_exec}
251-
app_data['annotationCountByType'] = {
252-
at_type: {'total': count} for at_type, count in execution_ann_counter.items()
253-
}
249+
app_data['annotationCountByType'] = dict(execution_ann_counter)
250+
app_data['annotationCountByType']['total'] = total_annotations_in_exec
254251
grouped_apps.append(app_data)
255252

256253
# Handle unassigned and problematic views
@@ -277,16 +274,19 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
277274
})
278275

279276
# aggregate total annotation counts
280-
total_annotations = 0
281-
total_annotations_by_type = defaultdict(lambda: {'total': 0})
277+
total_annotations_by_type = Counter()
282278
for execution in grouped_apps:
283279
# Only aggregate from actual apps, not the special unassigned entry
284280
if execution.get('app') != "http://apps.clams.ai/non-existing-app/v1":
285-
if 'annotationCount' in execution:
286-
total_annotations += execution['annotationCount']['total']
287281
if 'annotationCountByType' in execution:
288-
for at_type, data in execution['annotationCountByType'].items():
289-
total_annotations_by_type[at_type]['total'] += data['total']
282+
exec_counts = execution['annotationCountByType'].copy()
283+
del exec_counts['total']
284+
total_annotations_by_type.update(Counter(exec_counts))
285+
286+
final_total_annotations = sum(total_annotations_by_type.values())
287+
final_annotation_counts = dict(total_annotations_by_type)
288+
if final_total_annotations > 0:
289+
final_annotation_counts['total'] = final_total_annotations
290290

291291
return {
292292
"workflowId": workflow_id,
@@ -295,8 +295,7 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
295295
"errorViews": error_view_ids,
296296
"warningViews": warning_view_ids,
297297
"emptyViews": empty_view_ids,
298-
"annotationCount": {"total": total_annotations},
299-
"annotationCountByType": dict(total_annotations_by_type)
298+
"annotationCountByType": final_annotation_counts
300299
},
301300
"apps": grouped_apps
302301
}
@@ -314,25 +313,21 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
314313
* ``mmifCountByStatus``: A dictionary summarizing the processing status of
315314
all MMIF files in the collection. It includes:
316315
* ``total``: Total number of MMIF files found.
317-
* ``successful``: Number of MMIF files processed without errors or warnings.
316+
* ``successful``: Number of MMIF files processed without errors (may contain warnings).
318317
* ``withErrors``: Number of MMIF files containing app executions that reported errors.
319318
* ``withWarnings``: Number of MMIF files containing app executions that reported warnings.
320319
* ``invalid``: Number of files that failed to be parsed as valid MMIF.
321-
* ``mmifCountByWorkflow``: A dictionary mapping each unique ``workflowId``
322-
(from the single MMIF reports) to the ``count`` of MMIF files that share that workflow.
323-
* ``appProfilings``: A dictionary summarizing the aggregated performance
324-
statistics for each unique app found across the collection. Each entry,
325-
keyed by the app's URI, includes:
326-
* ``avgRunningTimeMS``: Average running time in milliseconds.
327-
* ``minRunningTimeMS``: Minimum running time in milliseconds.
328-
* ``maxRunningTimeMS``: Maximum running time in milliseconds.
329-
* ``stdevRunningTimeMS``: Standard deviation of running times in milliseconds
330-
(0 if only one execution).
320+
* ``workflows``: A list of "workflow" objects found in the "successful" MMIF files (files with errors
321+
are excluded), where each object contains:
322+
* ``workflowId``: The unique identifier for the workflow.
323+
* ``apps``: A list of app objects, each with ``app`` (name+ver identifier),
324+
``appConfiguration``, and ``appProfiling`` statistics (avg, min, max, stdev running times)
325+
aggregated per workflow.
326+
* ``mmifs``: A list of MMIF file basenames belonging to this workflow.
327+
* ``mmifCount``: The number of MMIF files in this workflow.
331328
* ``annotationCountByType``: A dictionary aggregating annotation counts
332-
across the entire collection. It includes:
333-
* ``total``: The grand total number of annotations across all MMIF files.
334-
* Individual entries keyed by annotation type URI, each showing the
335-
``total`` count for that specific annotation type.
329+
across the entire collection. It includes a ``total`` key for the grand
330+
total, plus integer counts for each individual annotation type.
336331
337332
---
338333
The docstring above is used to generate help messages for the CLI command.
@@ -342,24 +337,28 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
342337
:return: A dictionary containing the summarized collection specification.
343338
"""
344339
import statistics
345-
from collections import defaultdict
340+
from collections import defaultdict, Counter
346341

347-
mmif_files = Path(mmif_dir).glob('*.mmif')
342+
mmif_files = list(Path(mmif_dir).glob('*.mmif'))
348343

349344
status_summary = defaultdict(int)
350-
status_summary['total'] = 0
345+
status_summary['total'] = len(mmif_files)
351346
status_summary['successful'] = 0
352347
status_summary['withErrors'] = 0
353348
status_summary['withWarnings'] = 0
354349
status_summary['invalid'] = 0
355350

356-
workflow_analysis = defaultdict(lambda: {'count': 0})
357-
app_profilings_raw = defaultdict(list)
358-
annotation_counts = defaultdict(int)
359-
annotation_counts['total'] = 0
351+
aggregated_counts = Counter()
352+
353+
workflows_data = defaultdict(lambda: {
354+
'mmifs': [],
355+
'apps': defaultdict(lambda: {
356+
'appConfiguration': None, # Store the first config here
357+
'execution_times': []
358+
})
359+
})
360360

361361
for mmif_file in mmif_files:
362-
status_summary['total'] += 1
363362
try:
364363
single_report = describe_single_mmif(mmif_file)
365364
except Exception as e:
@@ -368,43 +367,75 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
368367

369368
if single_report['stats']['errorViews']:
370369
status_summary['withErrors'] += 1
371-
elif single_report['stats']['warningViews']:
370+
continue # Exclude from all other stats
371+
372+
# If we get here, the MMIF has no errors and is considered "successful"
373+
status_summary['successful'] += 1
374+
if single_report['stats']['warningViews']:
372375
status_summary['withWarnings'] += 1
373-
else:
374-
status_summary['successful'] += 1
375376

376-
# Workflow analysis
377377
wf_id = single_report['workflowId']
378-
workflow_analysis[wf_id]['count'] += 1
379-
380-
# App performance and annotation raw data collection from "apps" list
381-
for app_execution in single_report.get('apps', []):
382-
# app profilings
383-
app = app_execution.get('app')
384-
profiling = app_execution.get('appProfiling', {})
385-
running_time = profiling.get('runningTime')
386-
if app and running_time is not None:
387-
app_profilings_raw[app].append(running_time)
388-
389-
# annotation counts
390-
annotation_counts['total'] += app_execution.get('annotationCount', {}).get('total', 0)
391-
for at_type, data in app_execution.get('annotationCountByType', {}).items():
392-
annotation_counts[at_type] += data.get('total', 0)
393-
394-
# Process app performance data
395-
profiles = {}
396-
for app, execution_times in app_profilings_raw.items():
397-
if execution_times:
398-
profiles[app] = {
399-
'avgRunningTimeMS': statistics.mean(execution_times),
400-
'minRunningTimeMS': min(execution_times),
401-
'maxRunningTimeMS': max(execution_times),
402-
'stdevRunningTimeMS': statistics.stdev(execution_times) if len(execution_times) > 1 else 0
378+
workflows_data[wf_id]['mmifs'].append(Path(mmif_file).name)
379+
380+
# Aggregate annotation counts for successful mmifs
381+
report_counts = single_report['stats'].get('annotationCountByType', {})
382+
if 'total' in report_counts:
383+
del report_counts['total'] # don't add the sub-total to the main counter
384+
aggregated_counts.update(report_counts)
385+
386+
for app_exec in single_report.get('apps', []):
387+
app_uri = app_exec.get('app')
388+
# skip the special "unassigned" app
389+
if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1":
390+
running_time = app_exec.get('appProfiling', {}).get('runningTime')
391+
if running_time is not None:
392+
workflows_data[wf_id]['apps'][app_uri]['execution_times'].append(running_time)
393+
394+
# Store the first non-empty app configuration we find for this app in this workflow
395+
if workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] is None:
396+
config = app_exec.get('appConfiguration', {})
397+
if config:
398+
workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] = config
399+
400+
# Process collected data into the final output format
401+
final_workflows_list = []
402+
for wf_id, wf_data in sorted(workflows_data.items()):
403+
workflow_object = {
404+
'workflowId': wf_id,
405+
'mmifs': sorted(wf_data['mmifs']),
406+
'mmifCount': len(wf_data['mmifs']),
407+
'apps': []
408+
}
409+
410+
for app_uri, app_data in sorted(wf_data['apps'].items()):
411+
times = app_data['execution_times']
412+
if times:
413+
profiling_stats = {
414+
'avgRunningTimeMS': statistics.mean(times),
415+
'minRunningTimeMS': min(times),
416+
'maxRunningTimeMS': max(times),
417+
'stdevRunningTimeMS': statistics.stdev(times) if len(times) > 1 else 0
418+
}
419+
else:
420+
profiling_stats = {}
421+
422+
app_object = {
423+
'app': app_uri,
424+
'appConfiguration': app_data['appConfiguration'] or {}, # Default to empty dict
425+
'appProfiling': profiling_stats
403426
}
427+
workflow_object['apps'].append(app_object)
428+
429+
final_workflows_list.append(workflow_object)
430+
431+
# Finalize annotation counts
432+
final_annotation_counts = dict(aggregated_counts)
433+
grand_total = sum(final_annotation_counts.values())
434+
if grand_total > 0:
435+
final_annotation_counts['total'] = grand_total
404436

405437
return {
406438
'mmifCountByStatus': dict(status_summary),
407-
'mmifCountByWorkflow': {k: v for k, v in sorted(workflow_analysis.items())},
408-
'appProfilings': profiles,
409-
'annotationCountByType': dict(annotation_counts)
439+
'workflows': final_workflows_list,
440+
'annotationCountByType': final_annotation_counts
410441
}

0 commit comments

Comments
 (0)