Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions app_pack_generator/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging

import papermill

import jsonschema
from tabulate import tabulate

Expand Down Expand Up @@ -96,6 +97,9 @@ class ApplicationInterface(object):
# Free arguments that are papermill parameters not associated with stage
arguments: list[ApplicationParameter] = []

# Keyword value metadata pairs
metadata: dict = {}

class ApplicationNotebook(ApplicationInterface):
"""Defines a parsed Jupyter Notebook read as a JSON file."""

Expand All @@ -110,9 +114,10 @@ def __init__(self, notebook_filename):
self.notebook_parameters = []

self.filename = notebook_filename
self.parse_notebook(notebook_filename)
self.parse_notebook_parameters(notebook_filename)
self.parse_notebook_metadata(notebook_filename)

def parse_notebook(self, notebook_filename):
def parse_notebook_parameters(self, notebook_filename):
"""Parses validate notebook_filename as a valid, existing Jupyter Notebook to
ensure no exception is thrown.
"""
Expand Down Expand Up @@ -167,6 +172,29 @@ def parse_notebook(self, notebook_filename):
else:
self.arguments.append(app_param)

def parse_notebook_metadata(self, notebook_filename, name=None, language=None):
    """Extract keyword/value pairs from the notebook cell tagged 'metadata'.

    Uses papermill's inspection machinery to find the first cell tagged
    "metadata", parse its assignments with the kernel's translator, and
    store the evaluated values into ``self.metadata``. Silently does
    nothing when no such cell exists or when the notebook's language
    translator does not support introspection.

    Parameters
    ----------
    notebook_filename : str
        Path of the Jupyter Notebook to inspect.
    name : str, optional
        Kernel name override forwarded to papermill; defaults to the
        notebook's own kernel.
    language : str, optional
        Language override forwarded to papermill; defaults to the
        notebook's own language.
    """

    nb = papermill.inspection._open_notebook(notebook_filename, parameters=None)

    metadata_cell_idx = papermill.utils.find_first_tagged_cell_index(nb, "metadata")
    if metadata_cell_idx < 0:
        # No cell tagged "metadata": nothing to parse.
        return

    metadata_cell = nb.cells[metadata_cell_idx]

    kernel_name = papermill.utils.nb_kernel_name(nb, name)
    language = papermill.utils.nb_language(nb, language)

    translator = papermill.translators.papermill_translators.find_translator(kernel_name, language)
    try:
        inspected = translator.inspect(metadata_cell)
    except NotImplementedError:
        logger.warning(f"Translator for '{language}' language does not support parameter introspection.")
        # Bail out: without introspection there are no values to read.
        # (Previously execution fell through to an unbound local here,
        # raising NameError instead of degrading gracefully.)
        return

    # Rebind to a fresh instance-level dict so we never mutate the mutable
    # class-level ``metadata`` default shared by all instances.
    parsed = dict(self.metadata)
    for nb_param in inspected:
        # SECURITY NOTE: eval() executes arbitrary expressions taken from
        # the notebook. Acceptable only because the notebook is the trusted
        # application source itself; do not reuse on untrusted input.
        parsed[nb_param.name] = eval(nb_param.default)
    self.metadata = parsed

def parameter_summary(self):

headers = [ 'name', 'inferred_type', 'cwl_type', 'default', 'help' ]
Expand All @@ -180,3 +208,14 @@ def parameter_summary(self):
table_data.append(table_row)

return tabulate(table_data, headers=headers)

def metadata_summary(self):
    """Render the parsed notebook metadata as a plain-text table.

    Returns a tabulate-formatted string with one (name, value) row per
    entry in ``self.metadata``.
    """

    # dict items are already (key, value) tuples, i.e. one table row each.
    rows = list(self.metadata.items())

    return tabulate(rows, headers=['name', 'value'])
148 changes: 125 additions & 23 deletions app_pack_generator/cwl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import re
import yaml
import logging
from datetime import datetime
from operator import setitem

from .application import ApplicationNotebook

logger = logging.getLogger(__name__)

Expand All @@ -19,7 +23,7 @@ class CWLError(Exception):

class BaseCWL(object):

def __init__(self, application, template_dir=os.path.join(LOCAL_PATH, 'templates')):
def __init__(self, application : ApplicationNotebook, template_dir=os.path.join(LOCAL_PATH, 'templates')):

self.app = application
self.template_dir = template_dir
Expand All @@ -39,13 +43,30 @@ def generate_all(self, outdir, **kwargs):

class ProcessCWL(BaseCWL):

def __init__(self, application, **kwargs):
def __init__(self, application, repo_info, **kwargs):

super().__init__(application, **kwargs)

# Used to build metadata
self.repo_info = repo_info

# Template CWL and descriptor files
self.process_cwl = self._read_template( os.path.join(self.template_dir, 'process.cwl'))

@property
def _workflow(self):
    """Return the Workflow node of the CWL ``$graph``, or None if absent."""

    # Linear scan for the first graph entry declaring class == Workflow.
    for node in self.process_cwl['$graph']:
        if "class" in node and node["class"] == "Workflow":
            return node
    return None

@property
def _command_line_tool(self):
    """Return the CommandLineTool node of the CWL ``$graph``, or None if absent."""

    # Linear scan for the first graph entry declaring class == CommandLineTool.
    for node in self.process_cwl['$graph']:
        if "class" in node and node["class"] == "CommandLineTool":
            return node
    return None

def generate_all(self, outdir, dockerurl="undefined", **kwargs):
"""Calls all of the application CWL generators as well as the application descriptor generator.

Expand All @@ -61,57 +82,135 @@ def _insert_argument_params(self):
"Connect non stage in/out arguments to papermill parameters"

# Forward the ordinary argument parameters to the process step directly
input_dict = self.process_cwl['inputs']
workflow_input_dict = self._workflow['inputs']
workflow_process_in_dict = self._workflow['steps']['process']['in']
cmd_input_dict = self._command_line_tool['inputs']

for param in self.app.arguments:
name = param.name
param_name = param.name

input_dict[name] = {
param_def = {
'type': param.cwl_type,
'default': param.default,
'label': "argument",
'doc': param.help,
}

cmd_input_dict[param_name] = param_def.copy()
workflow_input_dict[param_name] = param_def.copy()

workflow_process_in_dict[param_name] = param_name

def _insert_input_params(self):
"Connects the special 'input' CWL parameter to the papermill parameter for recieving the stage-in directory location"
"Connects the special 'input' CWL parameter to the papermill parameter for receiving the stage-in directory location"

input_dict = self.process_cwl['inputs']
workflow_input_dict = self._workflow['inputs']
workflow_process_in_dict = self._workflow['steps']['process']['in']
cmd_input_dict = self._command_line_tool['inputs']

# The input Directory is carried as an input from stage_in to process to
# make sure that the contents are exposed to the process container
#
# If we had the stage_in collection filename be an input argument
# to process.cwl then it would be volume mounted in a seperate path
# to process.cwl then it would be volume mounted in a separate path
# and we could not assume that the collection file is in the same
# directory as the staged in results
if self.app.stage_in_param is not None:
input_dict['input'] = 'Directory'
param_name = self.app.stage_in_param.name

param_def = {
'type': "Directory",
'label': "stage-in",
'doc': self.app.stage_in_param.help,
}

self.process_cwl['arguments'] = self.process_cwl.get('arguments', [])
self.process_cwl['arguments'] += [
cmd_input_dict[param_name] = param_def.copy()
workflow_input_dict[param_name] = param_def.copy()

self._command_line_tool['arguments'] = self._command_line_tool.get('arguments', [])
self._command_line_tool['arguments'] += [
'-p', self.app.stage_in_param.name,
f'$(inputs.input.path)'
f'$(inputs.{param_name}.path)'
]

def _insert_output_params(self):
"Connects the special 'input' CWL parameter to the papermill parameter for recieving the stage-out directory location"

# Connect the stage-out parameter to the name of the file specified in the template as output
# That value should contain a full path by using $(runtime.outdir)
input_dict = self.process_cwl['inputs']
workflow_process_in_dict[param_name] = param_name

def _insert_output_params(self):
"Connects the special 'input' CWL parameter to the papermill parameter for receiving the stage-out directory location"

if self.app.stage_out_param is not None:
stage_out_process_dir = self.process_cwl['outputs']['output']['outputBinding']['glob']
stage_out_process_dir = self._command_line_tool['outputs']['outputs_result']['outputBinding']['glob']

if not re.search('runtime.outdir', stage_out_process_dir):
raise CWLError(f"The process CWL template outputs/output path needs to contain $(runtime.outdir) in the path")

self.process_cwl['arguments'] = self.process_cwl.get('arguments', [])
self.process_cwl['arguments'] += [
self._command_line_tool['arguments'] = self._command_line_tool.get('arguments', [])
self._command_line_tool['arguments'] += [
'-p', self.app.stage_out_param.name,
stage_out_process_dir
]

else:
del self.process_cwl['outputs']['output']
del self._command_line_tool['outputs']['output']

def _insert_metadata(self):
    """Populate workflow and schema.org (``s:``) metadata in the process CWL.

    Each known metadata keyword maps to a (setter closure, default value)
    pair; defaults are derived from the source repository info. Values
    supplied by the notebook's metadata cell (``self.app.metadata``)
    override the defaults. Any notebook metadata keyword not in the known
    set is written verbatim as a top-level ``s:<keyword>`` field.
    """

    # Defines overridable metadata values for the CWL with a setter function and default value
    # metadata_name: (setter_function, default_value)
    default_metadata = {
        # id/doc/label land on the Workflow node of the $graph.
        'id': (
            lambda v: setitem(self._workflow, 'id', v),
            self.repo_info.name
        ),
        'doc': (
            lambda v: setitem(self._workflow, 'doc', v),
            f"OGC Application for {self.repo_info.name} built from Jupyter notebook: {os.path.basename(self.app.filename)} from source repository: {self.repo_info.source_location}"
        ),
        'label': (
            lambda v: setitem(self._workflow, 'label', v),
            f"OGC Application for {self.repo_info.name}"
        ),
        # NOTE(review): assumes the template already defines "s:author" as a
        # non-empty list of author mappings — confirm against process.cwl.
        'author': (
            lambda v: setitem(self.process_cwl["s:author"][0], "s:name", v),
            self.repo_info.owner
        ),
        'citation': (
            lambda v: setitem(self.process_cwl, "s:citation", v),
            self.repo_info.source_location
        ),
        'codeRepository': (
            lambda v: setitem(self.process_cwl, "s:codeRepository", v),
            self.repo_info.source_location
        ),
        'commitHash': (
            lambda v: setitem(self.process_cwl, "s:commitHash", v),
            self.repo_info.commit_identifier
        ),
        # Default creation date is generation time; a datetime.date object
        # is stored here — presumably serialized by the YAML writer, verify.
        'dateCreated': (
            lambda v: setitem(self.process_cwl, "s:dateCreated", v),
            datetime.now().date()
        ),
        'version': (
            lambda v: setitem(self.process_cwl, "s:version", v),
            "0.1.0"
        ),
        'softwareVersion': (
            lambda v: setitem(self.process_cwl, "s:softwareVersion", v),
            "0.1.0"
        ),
    }

    # Process all metadata values with a default value
    for keyword, (setter, default_value) in default_metadata.items():
        if keyword in self.app.metadata:
            setter(self.app.metadata[keyword])
        else:
            setter(default_value)

    # Process additional metadata that is not in the list of default values
    for keyword in self.app.metadata:
        if keyword not in default_metadata:
            self.process_cwl[f"s:{keyword}"] = self.app.metadata[keyword]

def generate_process_cwl(self, outdir, dockerurl):
"""Generates the application CWL.
Expand All @@ -122,7 +221,7 @@ def generate_process_cwl(self, outdir, dockerurl):
os.makedirs(outdir)

# Set correct URL for process Docker container
self.process_cwl['requirements']['DockerRequirement']['dockerPull'] = dockerurl
self._command_line_tool['requirements']['DockerRequirement']['dockerPull'] = dockerurl

# Forward the ordinary argument parameters to the process step directly
self._insert_argument_params()
Expand All @@ -131,6 +230,9 @@ def generate_process_cwl(self, outdir, dockerurl):
self._insert_input_params()
self._insert_output_params()

# Add metadata on the source repository
self._insert_metadata()

fname = os.path.join(outdir, 'process.cwl')
write_cwl_file(fname, self.process_cwl)
return fname
Expand Down
27 changes: 25 additions & 2 deletions app_pack_generator/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,16 @@ def image_reference(self):
else:
return f"{self.image_repository}:{self.image_tag}"

def image_source(self):
    """Return the source repository URL to associate with the Docker image.

    GitHub locations are normalized to ``https://github.com/<owner>/<name>``
    so the container package can be linked back to its repository (e.g. by
    GHCR); any other source location is returned unchanged.

    Returns
    -------
    str or None
        The source URL, or None when the git manager has no recorded
        source location (callers already check for None).
    """

    source = self.git_mgr.source_location

    # Guard: without a recorded source location there is nothing to label.
    # (Previously `"github.com" in None` would raise TypeError.)
    if source is None:
        return None

    # If the source is from Github then use the parsed owner
    # and repo name to properly set up package linking
    if "github.com" in source:
        source = f"https://github.com/{self.git_mgr.owner}/{self.git_mgr.name}"

    return source

def repo2docker(self):
"""Calls repo2docker on the local git directory to generate the Docker image.

Expand All @@ -96,8 +106,21 @@ def repo2docker(self):
logger.info(f"Building Docker image named {self.image_reference}")

# Build initial repo2docker command line arguments
cmd = ['jupyter-repo2docker', '--user-id', '1000', '--user-name', 'jovyan',
'--no-run', '--debug', '--image-name', self.image_reference]
# Do not supply the --user-id argument as it will cause
# permission issues when the CWL is run using
# cwltool with the --no-match-user argument
cmd = ['jupyter-repo2docker',
'--user-name', 'jovyan',
'--no-run', '--debug',
'--image-name', self.image_reference]

# Add repo source when appropriate
# Used by GHCR to connect a package to a repo
# https://docs.github.com/en/packages/learn-github-packages/connecting-a-repository-to-a-package#connecting-a-repository-to-a-container-image-using-the-command-line
source = self.image_source()
if source is not None:
cmd += [ '--label',
f'org.opencontainers.image.source={source}' ]

if self.repo_config is not None:
# If the repo2docker config file does not exist inside the repo already, assume it is a URL
Expand Down
Loading