@@ -156,7 +156,7 @@ def _get_profile_data(view) -> dict:
156156 # need to convert to milliseconds integer
157157 time_obj = datetime .datetime .strptime (running_time_str , "%H:%M:%S.%f" ).time ()
158158 milliseconds = (time_obj .hour * 3600 + time_obj .minute * 60 + time_obj .second ) * 1000 + time_obj .microsecond // 1000
159- return {"runningTime " : milliseconds }
159+ return {"runningTimeMS " : milliseconds }
160160
161161
162162def describe_single_mmif (mmif_file : Union [str , Path ]) -> dict :
@@ -189,10 +189,9 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
189189 * ``errorViews``: A list of view IDs that reported errors.
190190 * ``warningViews``: A list of view IDs that reported warnings.
191191 * ``emptyViews``: A list of view IDs that contain no annotations.
192- * ``annotationCount``: A dictionary with the ``total`` number of
193- annotations across all app executions.
194192 * ``annotationCountByType``: A dictionary mapping each annotation
195- type to its ``total`` count across all app executions.
193+ type to its count, plus a ``total`` key (present only when non-zero) for the sum of all
194+ annotations across all app executions.
196195 * ``apps``: A list of objects, where each object represents one app
197196 execution. It includes metadata, profiling, and aggregated statistics
198197 for all views generated by that execution. A special entry for views
@@ -247,10 +246,8 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
247246 }
248247 total_annotations_in_exec = sum (execution_ann_counter .values ())
249248 if total_annotations_in_exec > 0 :
250- app_data ['annotationCount' ] = {'total' : total_annotations_in_exec }
251- app_data ['annotationCountByType' ] = {
252- at_type : {'total' : count } for at_type , count in execution_ann_counter .items ()
253- }
249+ app_data ['annotationCountByType' ] = dict (execution_ann_counter )
250+ app_data ['annotationCountByType' ]['total' ] = total_annotations_in_exec
254251 grouped_apps .append (app_data )
255252
256253 # Handle unassigned and problematic views
@@ -277,16 +274,19 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
277274 })
278275
279276 # aggregate total annotation counts
280- total_annotations = 0
281- total_annotations_by_type = defaultdict (lambda : {'total' : 0 })
277+ total_annotations_by_type = Counter ()
282278 for execution in grouped_apps :
283279 # Only aggregate from actual apps, not the special unassigned entry
284280 if execution .get ('app' ) != "http://apps.clams.ai/non-existing-app/v1" :
285- if 'annotationCount' in execution :
286- total_annotations += execution ['annotationCount' ]['total' ]
287281 if 'annotationCountByType' in execution :
288- for at_type , data in execution ['annotationCountByType' ].items ():
289- total_annotations_by_type [at_type ]['total' ] += data ['total' ]
282+ exec_counts = execution ['annotationCountByType' ].copy ()
283+ del exec_counts ['total' ]
284+ total_annotations_by_type .update (Counter (exec_counts ))
285+
286+ final_total_annotations = sum (total_annotations_by_type .values ())
287+ final_annotation_counts = dict (total_annotations_by_type )
288+ if final_total_annotations > 0 :
289+ final_annotation_counts ['total' ] = final_total_annotations
290290
291291 return {
292292 "workflowId" : workflow_id ,
@@ -295,8 +295,7 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict:
295295 "errorViews" : error_view_ids ,
296296 "warningViews" : warning_view_ids ,
297297 "emptyViews" : empty_view_ids ,
298- "annotationCount" : {"total" : total_annotations },
299- "annotationCountByType" : dict (total_annotations_by_type )
298+ "annotationCountByType" : final_annotation_counts
300299 },
301300 "apps" : grouped_apps
302301 }
@@ -314,25 +313,21 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
314313 * ``mmifCountByStatus``: A dictionary summarizing the processing status of
315314 all MMIF files in the collection. It includes:
316315 * ``total``: Total number of MMIF files found.
317- * ``successful``: Number of MMIF files processed without errors or warnings.
316+ * ``successful``: Number of MMIF files processed without errors (may contain warnings) .
318317 * ``withErrors``: Number of MMIF files containing app executions that reported errors.
319318 * ``withWarnings``: Number of error-free MMIF files containing app executions that reported warnings (a subset of ``successful``).
320319 * ``invalid``: Number of files that failed to be parsed as valid MMIF.
321- * ``mmifCountByWorkflow``: A dictionary mapping each unique ``workflowId``
322- (from the single MMIF reports) to the ``count`` of MMIF files that share that workflow.
323- * ``appProfilings``: A dictionary summarizing the aggregated performance
324- statistics for each unique app found across the collection. Each entry,
325- keyed by the app's URI, includes:
326- * ``avgRunningTimeMS``: Average running time in milliseconds.
327- * ``minRunningTimeMS``: Minimum running time in milliseconds.
328- * ``maxRunningTimeMS``: Maximum running time in milliseconds.
329- * ``stdevRunningTimeMS``: Standard deviation of running times in milliseconds
330- (0 if only one execution).
320+ * ``workflows``: A list of "workflow" objects found in the "successful" MMIF files (files with errors
321+ are excluded), where each object contains:
322+ * ``workflowId``: The unique identifier for the workflow.
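323+ * ``apps``: A list of app objects, each with ``app`` (the app URI, i.e. name+ver identifier),
324+ ``appConfiguration`` (the first non-empty configuration found for that app in the workflow; ``{}`` if none), and
325+ ``appProfiling`` (``avgRunningTimeMS``, ``minRunningTimeMS``, ``maxRunningTimeMS`` and ``stdevRunningTimeMS``, aggregated per workflow; stdev is 0 for a single execution, and the dictionary is empty when no running times were recorded).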
326+ * ``mmifs``: A list of MMIF file basenames belonging to this workflow.
327+ * ``mmifCount``: The number of MMIF files in this workflow.
331328 * ``annotationCountByType``: A dictionary aggregating annotation counts
332- across the entire collection. It includes:
333- * ``total``: The grand total number of annotations across all MMIF files.
334- * Individual entries keyed by annotation type URI, each showing the
335- ``total`` count for that specific annotation type.
329+ across the error-free ("successful") MMIF files in the collection. It includes integer counts
330+ for each individual annotation type, plus a ``total`` key for the grand total (present only when non-zero).
336331
337332 ---
338333 The docstring above is used to generate help messages for the CLI command.
@@ -342,24 +337,28 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
342337 :return: A dictionary containing the summarized collection specification.
343338 """
344339 import statistics
345- from collections import defaultdict
340+ from collections import defaultdict , Counter
346341
347- mmif_files = Path (mmif_dir ).glob ('*.mmif' )
342+ mmif_files = list ( Path (mmif_dir ).glob ('*.mmif' ) )
348343
349344 status_summary = defaultdict (int )
350- status_summary ['total' ] = 0
345+ status_summary ['total' ] = len ( mmif_files )
351346 status_summary ['successful' ] = 0
352347 status_summary ['withErrors' ] = 0
353348 status_summary ['withWarnings' ] = 0
354349 status_summary ['invalid' ] = 0
355350
356- workflow_analysis = defaultdict (lambda : {'count' : 0 })
357- app_profilings_raw = defaultdict (list )
358- annotation_counts = defaultdict (int )
359- annotation_counts ['total' ] = 0
351+ aggregated_counts = Counter ()
352+
353+ workflows_data = defaultdict (lambda : {
354+ 'mmifs' : [],
355+ 'apps' : defaultdict (lambda : {
356+ 'appConfiguration' : None , # Store the first config here
357+ 'execution_times' : []
358+ })
359+ })
360360
361361 for mmif_file in mmif_files :
362- status_summary ['total' ] += 1
363362 try :
364363 single_report = describe_single_mmif (mmif_file )
365364 except Exception as e :
@@ -368,43 +367,75 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict:
368367
369368 if single_report ['stats' ]['errorViews' ]:
370369 status_summary ['withErrors' ] += 1
371- elif single_report ['stats' ]['warningViews' ]:
370+ continue # Exclude from all other stats
371+
372+ # If we get here, the MMIF has no errors and is considered "successful"
373+ status_summary ['successful' ] += 1
374+ if single_report ['stats' ]['warningViews' ]:
372375 status_summary ['withWarnings' ] += 1
373- else :
374- status_summary ['successful' ] += 1
375376
376- # Workflow analysis
377377 wf_id = single_report ['workflowId' ]
378- workflow_analysis [wf_id ]['count' ] += 1
379-
380- # App performance and annotation raw data collection from "apps" list
381- for app_execution in single_report .get ('apps' , []):
382- # app profilings
383- app = app_execution .get ('app' )
384- profiling = app_execution .get ('appProfiling' , {})
385- running_time = profiling .get ('runningTime' )
386- if app and running_time is not None :
387- app_profilings_raw [app ].append (running_time )
388-
389- # annotation counts
390- annotation_counts ['total' ] += app_execution .get ('annotationCount' , {}).get ('total' , 0 )
391- for at_type , data in app_execution .get ('annotationCountByType' , {}).items ():
392- annotation_counts [at_type ] += data .get ('total' , 0 )
393-
394- # Process app performance data
395- profiles = {}
396- for app , execution_times in app_profilings_raw .items ():
397- if execution_times :
398- profiles [app ] = {
399- 'avgRunningTimeMS' : statistics .mean (execution_times ),
400- 'minRunningTimeMS' : min (execution_times ),
401- 'maxRunningTimeMS' : max (execution_times ),
402- 'stdevRunningTimeMS' : statistics .stdev (execution_times ) if len (execution_times ) > 1 else 0
378+ workflows_data [wf_id ]['mmifs' ].append (Path (mmif_file ).name )
379+
380+ # Aggregate annotation counts for successful mmifs
381+ report_counts = single_report ['stats' ].get ('annotationCountByType' , {})
382+ if 'total' in report_counts :
383+ del report_counts ['total' ] # don't add the sub-total to the main counter
384+ aggregated_counts .update (report_counts )
385+
386+ for app_exec in single_report .get ('apps' , []):
387+ app_uri = app_exec .get ('app' )
388+ # skip the special "unassigned" app
389+ if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1" :
390+ running_time = app_exec .get ('appProfiling' , {}).get ('runningTime' )
391+ if running_time is not None :
392+ workflows_data [wf_id ]['apps' ][app_uri ]['execution_times' ].append (running_time )
393+
394+ # Store the first non-empty app configuration we find for this app in this workflow
395+ if workflows_data [wf_id ]['apps' ][app_uri ]['appConfiguration' ] is None :
396+ config = app_exec .get ('appConfiguration' , {})
397+ if config :
398+ workflows_data [wf_id ]['apps' ][app_uri ]['appConfiguration' ] = config
399+
400+ # Process collected data into the final output format
401+ final_workflows_list = []
402+ for wf_id , wf_data in sorted (workflows_data .items ()):
403+ workflow_object = {
404+ 'workflowId' : wf_id ,
405+ 'mmifs' : sorted (wf_data ['mmifs' ]),
406+ 'mmifCount' : len (wf_data ['mmifs' ]),
407+ 'apps' : []
408+ }
409+
410+ for app_uri , app_data in sorted (wf_data ['apps' ].items ()):
411+ times = app_data ['execution_times' ]
412+ if times :
413+ profiling_stats = {
414+ 'avgRunningTimeMS' : statistics .mean (times ),
415+ 'minRunningTimeMS' : min (times ),
416+ 'maxRunningTimeMS' : max (times ),
417+ 'stdevRunningTimeMS' : statistics .stdev (times ) if len (times ) > 1 else 0
418+ }
419+ else :
420+ profiling_stats = {}
421+
422+ app_object = {
423+ 'app' : app_uri ,
424+ 'appConfiguration' : app_data ['appConfiguration' ] or {}, # Default to empty dict
425+ 'appProfiling' : profiling_stats
403426 }
427+ workflow_object ['apps' ].append (app_object )
428+
429+ final_workflows_list .append (workflow_object )
430+
431+ # Finalize annotation counts
432+ final_annotation_counts = dict (aggregated_counts )
433+ grand_total = sum (final_annotation_counts .values ())
434+ if grand_total > 0 :
435+ final_annotation_counts ['total' ] = grand_total
404436
405437 return {
406438 'mmifCountByStatus' : dict (status_summary ),
407- 'mmifCountByWorkflow' : {k : v for k , v in sorted (workflow_analysis .items ())},
408- 'appProfilings' : profiles ,
409- 'annotationCountByType' : dict (annotation_counts )
439+ 'workflows' : final_workflows_list ,
440+ 'annotationCountByType' : final_annotation_counts
410441 }
0 commit comments