diff --git a/app_pack_generator/application.py b/app_pack_generator/application.py index 21ccc39..1554b48 100644 --- a/app_pack_generator/application.py +++ b/app_pack_generator/application.py @@ -4,6 +4,7 @@ import logging import papermill + import jsonschema from tabulate import tabulate @@ -96,6 +97,9 @@ class ApplicationInterface(object): # Free arguments that are papermill parameters not associated with stage arguments: list[ApplicationParameter] = [] + # Keyword value metadata pairs + metadata: dict = {} + class ApplicationNotebook(ApplicationInterface): """Defines a parsed Jupyter Notebook read as a JSON file.""" @@ -110,9 +114,10 @@ def __init__(self, notebook_filename): self.notebook_parameters = [] self.filename = notebook_filename - self.parse_notebook(notebook_filename) + self.parse_notebook_parameters(notebook_filename) + self.parse_notebook_metadata(notebook_filename) - def parse_notebook(self, notebook_filename): + def parse_notebook_parameters(self, notebook_filename): """Parses validate notebook_filename as a valid, existing Jupyter Notebook to ensure no exception is thrown. 
""" @@ -167,6 +172,29 @@ def parse_notebook(self, notebook_filename): else: self.arguments.append(app_param) + def parse_notebook_metadata(self, notebook_filename, name=None, language=None): + "Use papermill inspection to get a list of keyword/value pairs from a 'metadata' cell" + + nb = papermill.inspection._open_notebook(notebook_filename, parameters=None) + + metadata_cell_idx = papermill.utils.find_first_tagged_cell_index(nb, "metadata") + if metadata_cell_idx < 0: + return + + metadata_cell = nb.cells[metadata_cell_idx] + + kernel_name = papermill.utils.nb_kernel_name(nb, name) + language = papermill.utils.nb_language(nb, language) + + translator = papermill.translators.papermill_translators.find_translator(kernel_name, language) + try: + metadata = translator.inspect(metadata_cell) + except NotImplementedError: + logger.warning(f"Translator for '{language}' language does not support parameter introspection.") + + for nb_param in metadata: + self.metadata[nb_param.name] = eval(nb_param.default) + def parameter_summary(self): headers = [ 'name', 'inferred_type', 'cwl_type', 'default', 'help' ] @@ -180,3 +208,14 @@ def parameter_summary(self): table_data.append(table_row) return tabulate(table_data, headers=headers) + + def metadata_summary(self): + + headers = [ 'name', 'value' ] + + # Build up rows of the table using the header values as the columns + table_data = [] + for key, value in self.metadata.items(): + table_data.append( (key, value) ) + + return tabulate(table_data, headers=headers) diff --git a/app_pack_generator/cwl.py b/app_pack_generator/cwl.py index dd421c7..0837cb3 100644 --- a/app_pack_generator/cwl.py +++ b/app_pack_generator/cwl.py @@ -2,6 +2,10 @@ import re import yaml import logging +from datetime import datetime +from operator import setitem + +from .application import ApplicationNotebook logger = logging.getLogger(__name__) @@ -19,7 +23,7 @@ class CWLError(Exception): class BaseCWL(object): - def __init__(self, application, 
template_dir=os.path.join(LOCAL_PATH, 'templates')): + def __init__(self, application : ApplicationNotebook, template_dir=os.path.join(LOCAL_PATH, 'templates')): self.app = application self.template_dir = template_dir @@ -39,13 +43,30 @@ def generate_all(self, outdir, **kwargs): class ProcessCWL(BaseCWL): - def __init__(self, application, **kwargs): + def __init__(self, application, repo_info, **kwargs): super().__init__(application, **kwargs) + + # Used to build metadata + self.repo_info = repo_info # Template CWL and descriptor files self.process_cwl = self._read_template( os.path.join(self.template_dir, 'process.cwl')) + @property + def _workflow(self): + + graph = self.process_cwl['$graph'] + search_func = lambda d: "class" in d and d["class"] == "Workflow" + return next(filter(search_func, graph), None) + + @property + def _command_line_tool(self): + + graph = self.process_cwl['$graph'] + search_func = lambda d: "class" in d and d["class"] == "CommandLineTool" + return next(filter(search_func, graph), None) + def generate_all(self, outdir, dockerurl="undefined", **kwargs): """Calls all of the application CWL generators as well as the application descriptor generator. 
@@ -61,57 +82,135 @@ def _insert_argument_params(self): "Connect non stage in/out arguments to papermill parameters" # Forward the ordinary argument parameters to the process step directly - input_dict = self.process_cwl['inputs'] + workflow_input_dict = self._workflow['inputs'] + workflow_process_in_dict = self._workflow['steps']['process']['in'] + cmd_input_dict = self._command_line_tool['inputs'] + for param in self.app.arguments: - name = param.name + param_name = param.name - input_dict[name] = { + param_def = { 'type': param.cwl_type, 'default': param.default, + 'label': "argument", + 'doc': param.help, } + cmd_input_dict[param_name] = param_def.copy() + workflow_input_dict[param_name] = param_def.copy() + + workflow_process_in_dict[param_name] = param_name + def _insert_input_params(self): - "Connects the special 'input' CWL parameter to the papermill parameter for recieving the stage-in directory location" + "Connects the special 'input' CWL parameter to the papermill parameter for receiving the stage-in directory location" - input_dict = self.process_cwl['inputs'] + workflow_input_dict = self._workflow['inputs'] + workflow_process_in_dict = self._workflow['steps']['process']['in'] + cmd_input_dict = self._command_line_tool['inputs'] # The input Directory is carried as an input from stage_in to process to # make sure that the contents are exposed to the process container # # If we had the stage_in collection filename be an input argument - # to process.cwl then it would be volume mounted in a seperate path + # to process.cwl then it would be volume mounted in a separate path # and we could not assume that the collection file is in the same # directory as the staged in results if self.app.stage_in_param is not None: - input_dict['input'] = 'Directory' + param_name = self.app.stage_in_param.name + + param_def = { + 'type': "Directory", + 'label': "stage-in", + 'doc': self.app.stage_in_param.help, + } - self.process_cwl['arguments'] = 
self.process_cwl.get('arguments', []) - self.process_cwl['arguments'] += [ + cmd_input_dict[param_name] = param_def.copy() + workflow_input_dict[param_name] = param_def.copy() + + self._command_line_tool['arguments'] = self._command_line_tool.get('arguments', []) + self._command_line_tool['arguments'] += [ '-p', self.app.stage_in_param.name, - f'$(inputs.input.path)' + f'$(inputs.{param_name}.path)' ] - - def _insert_output_params(self): - "Connects the special 'input' CWL parameter to the papermill parameter for recieving the stage-out directory location" - # Connect the stage-out parameter to the name of the file specified in the template as output - # That value should contain a full path by using $(runtime.outdir) - input_dict = self.process_cwl['inputs'] + workflow_process_in_dict[param_name] = param_name + + def _insert_output_params(self): + "Connects the special 'input' CWL parameter to the papermill parameter for receiving the stage-out directory location" if self.app.stage_out_param is not None: - stage_out_process_dir = self.process_cwl['outputs']['output']['outputBinding']['glob'] + stage_out_process_dir = self._command_line_tool['outputs']['outputs_result']['outputBinding']['glob'] if not re.search('runtime.outdir', stage_out_process_dir): raise CWLError(f"The process CWL template outputs/output path needs to contain $(runtime.outdir) in the path") - self.process_cwl['arguments'] = self.process_cwl.get('arguments', []) - self.process_cwl['arguments'] += [ + self._command_line_tool['arguments'] = self._command_line_tool.get('arguments', []) + self._command_line_tool['arguments'] += [ '-p', self.app.stage_out_param.name, stage_out_process_dir ] else: - del self.process_cwl['outputs']['output'] + del self._command_line_tool['outputs']['output'] + + def _insert_metadata(self): + + # Defines overridable metadata values for the CWL with a setter function and default value + # metadata_name: (setter_function, default_value) + default_metadata = { + 'id': ( + 
lambda v: setitem(self._workflow, 'id', v), + self.repo_info.name + ), + 'doc': ( + lambda v: setitem(self._workflow, 'doc', v), + f"OGC Application for {self.repo_info.name} built from Jupyter notebook: {os.path.basename(self.app.filename)} from source repository: {self.repo_info.source_location}" + ), + 'label': ( + lambda v: setitem(self._workflow, 'label', v), + f"OGC Application for {self.repo_info.name}" + ), + 'author': ( + lambda v: setitem(self.process_cwl["s:author"][0], "s:name", v), + self.repo_info.owner + ), + 'citation': ( + lambda v: setitem(self.process_cwl, "s:citation", v), + self.repo_info.source_location + ), + 'codeRepository': ( + lambda v: setitem(self.process_cwl, "s:codeRepository", v), + self.repo_info.source_location + ), + 'commitHash': ( + lambda v: setitem(self.process_cwl, "s:commitHash", v), + self.repo_info.commit_identifier + ), + 'dateCreated': ( + lambda v: setitem(self.process_cwl, "s:dateCreated", v), + datetime.now().date() + ), + 'version': ( + lambda v: setitem(self.process_cwl, "s:version", v), + "0.1.0" + ), + 'softwareVersion': ( + lambda v: setitem(self.process_cwl, "s:softwareVersion", v), + "0.1.0" + ), + } + + # Process all metadata values with a default value + for keyword, (setter, default_value) in default_metadata.items(): + if keyword in self.app.metadata: + setter(self.app.metadata[keyword]) + else: + setter(default_value) + + # Process additional metadata that is not in the list of default values + for keyword in self.app.metadata: + if keyword not in default_metadata: + self.process_cwl[f"s:{keyword}"] = self.app.metadata[keyword] def generate_process_cwl(self, outdir, dockerurl): """Generates the application CWL. 
@@ -122,7 +221,7 @@ def generate_process_cwl(self, outdir, dockerurl): os.makedirs(outdir) # Set correct URL for process Docker container - self.process_cwl['requirements']['DockerRequirement']['dockerPull'] = dockerurl + self._command_line_tool['requirements']['DockerRequirement']['dockerPull'] = dockerurl # Forward the ordinary argument parameters to the process step directly self._insert_argument_params() @@ -131,6 +230,9 @@ def generate_process_cwl(self, outdir, dockerurl): self._insert_input_params() self._insert_output_params() + # Add metadata on the source repository + self._insert_metadata() + fname = os.path.join(outdir, 'process.cwl') write_cwl_file(fname, self.process_cwl) return fname diff --git a/app_pack_generator/docker.py b/app_pack_generator/docker.py index 12d6826..c5bcaa5 100644 --- a/app_pack_generator/docker.py +++ b/app_pack_generator/docker.py @@ -79,6 +79,16 @@ def image_reference(self): else: return f"{self.image_repository}:{self.image_tag}" + def image_source(self): + source = self.git_mgr.source_location + + # If the source is from GitHub then use the parsed owner + # and repo name to properly set up package linking + if "github.com" in source: + source = f"https://github.com/{self.git_mgr.owner}/{self.git_mgr.name}" + + return source + def repo2docker(self): """Calls repo2docker on the local git directory to generate the Docker image. 
@@ -96,8 +106,21 @@ def repo2docker(self): logger.info(f"Building Docker image named {self.image_reference}") # Build initial repo2docker command line arguments - cmd = ['jupyter-repo2docker', '--user-id', '1000', '--user-name', 'jovyan', - '--no-run', '--debug', '--image-name', self.image_reference] + # Do not supply the --user-id argument as it will cause + # permission issues when the CWL is run using + # cwltool with the --no-match-user argument + cmd = ['jupyter-repo2docker', + '--user-name', 'jovyan', + '--no-run', '--debug', + '--image-name', self.image_reference] + + # Add repo source when appropriate + # Used by GHCR to connect a package to a repo + # https://docs.github.com/en/packages/learn-github-packages/connecting-a-repository-to-a-package#connecting-a-repository-to-a-container-image-using-the-command-line + source = self.image_source() + if source is not None: + cmd += [ '--label', + f'org.opencontainers.image.source={source}' ] if self.repo_config is not None: # If the repo2docker config file does not exist inside the repo already, assume it is a URL diff --git a/app_pack_generator/templates/process.cwl b/app_pack_generator/templates/process.cwl index 76db015..00b7b09 100644 --- a/app_pack_generator/templates/process.cwl +++ b/app_pack_generator/templates/process.cwl @@ -1,40 +1,67 @@ #!/usr/bin/env cwl-runner cwlVersion: v1.2 -class: CommandLineTool -baseCommand: - - papermill - - /home/jovyan/process.ipynb - - --cwd - - /home/jovyan - - output_nb.ipynb - - -f - - /tmp/inputs.json - - --log-output - - -k - - python3 -requirements: - DockerRequirement: - dockerPull: marjoluc/hello-world:stable - InlineJavascriptRequirement: {} - ShellCommandRequirement: {} - InitialWorkDirRequirement: - listing: - - entryname: /tmp/inputs.json - entry: $(inputs) - InplaceUpdateRequirement: - inplaceUpdate: true - NetworkAccess: - networkAccess: true -inputs: - # Where the incoming data for the process is placed - input: Directory -outputs: - # Where the process 
placed outgoing data - output: - outputBinding: - glob: "$(runtime.outdir)" - type: Directory - process_output_nb: - outputBinding: - glob: "$(runtime.outdir)/output_nb.ipynb" - type: File +$graph: +- class: Workflow + label: "" + doc: "" + id: "" + inputs: {} + # Where the incoming data for the process is placed + outputs: + out: + type: Directory + outputSource: process/outputs_result + steps: + process: + run: '#main' + in: {} + out: + - outputs_result +- class: CommandLineTool + id: main + baseCommand: + - papermill + - /home/jovyan/process.ipynb + - output_nb.ipynb + - -f + - /tmp/inputs.json + - --log-output + - -k + - python3 + requirements: + DockerRequirement: + dockerPull: unity-sds/mdps-example-application:latest + ShellCommandRequirement: {} + InitialWorkDirRequirement: + listing: + - entryname: /tmp/inputs.json + entry: $(inputs) + InplaceUpdateRequirement: + inplaceUpdate: true + NetworkAccess: + networkAccess: true + inputs: {} + # Where the incoming data for the process is placed + outputs: + # Where the process placed outgoing data + outputs_result: + outputBinding: + glob: "$(runtime.outdir)" + type: Directory + +s:author: +- class: s:Person + s:name: mdps-app-generator +s:citation: "" +s:codeRepository: "" +s:commitHash: "" +s:dateCreated: "" +s:license: "" +s:softwareVersion: "" +s:version: "" +s:releaseNotes: "" +s:keywords: "" +$namespaces: + s: https://schema.org/ +$schemas: +- https://raw.githubusercontent.com/schemaorg/schemaorg/refs/heads/main/data/releases/9.0/schemaorg-current-http.rdf \ No newline at end of file