Skip to content

Commit a2ab158

Browse files
authored
Merge pull request #47 from CESNET/dev
Update main branch to the state deployed on qa2: added the aio_pika module for sending/receiving messages asynchronously over RabbitMQ; fixed Singularity volumes; increased the timeout for job completion; made tests compatible with the Singularity container type.
2 parents 9292d9b + 066cadc commit a2ab158

14 files changed

Lines changed: 307 additions & 104 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,11 @@ jobs:
1313

1414
- name: check out repository
1515
uses: actions/checkout@v4
16+
17+
- name: Set up Python
18+
uses: actions/setup-python@v4
19+
with:
20+
python-version: 3.11
1621

1722
- name: Set up Docker Buildx
1823
uses: docker/setup-buildx-action@v2
@@ -31,6 +36,9 @@ jobs:
3136
run: |
3237
python tests/upload_server.py &
3338
39+
- name: Install dependencies
40+
run: pip install .
41+
3442
- name: instal pytest
3543
run: pip install pytest
3644

docker-compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ services:
66
target: development
77
image: tesp-api
88
environment:
9-
- CONTAINER_TYPE=docker # Set to "docker", "singularity", or "both"
9+
- CONTAINER_TYPE=singularity # Set to "docker", "singularity", or "both"
1010
container_name: tesp-api
1111
privileged: true
1212
ports:

docker/pulsar_rest/Dockerfile

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,28 @@ RUN pip install 'pulsar-app[web]'
2626
FROM python-base as development
2727
COPY --from=builder $PYSETUP_PATH $PYSETUP_PATH
2828

29+
# Install dependencies required by Apptainer (Singularity)
30+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
31+
curl \
32+
gnupg-agent \
33+
software-properties-common \
34+
lsb-release \
35+
wget \
36+
libseccomp2 \
37+
uidmap \
38+
squashfs-tools \
39+
squashfuse \
40+
fuse2fs \
41+
fuse-overlayfs \
42+
fakeroot \
43+
cryptsetup
44+
45+
# Download and install Apptainer
46+
ARG APPTAINER_VERSION=1.3.6
47+
RUN curl -LO https://github.com/apptainer/apptainer/releases/download/v${APPTAINER_VERSION}/apptainer_${APPTAINER_VERSION}_amd64.deb \
48+
&& apt-get install -y ./apptainer_${APPTAINER_VERSION}_amd64.deb \
49+
&& rm apptainer_${APPTAINER_VERSION}_amd64.deb
50+
2951
RUN apt-get update && apt-get install -y curl gnupg-agent software-properties-common lsb-release
3052
RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -
3153
RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
@@ -37,4 +59,4 @@ WORKDIR $PYSETUP_PATH
3759
COPY startup.sh startup.sh
3860
RUN pulsar-config --host 0.0.0.0
3961
EXPOSE 8913
40-
CMD ["/bin/bash", "./startup.sh"]
62+
CMD ["/bin/bash", "./startup.sh"]

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ repository = "https://github.com/ndopj/tesp-api"
1010

1111
[tool.poetry.dependencies]
1212
python = "^3.10.0"
13+
aio_pika = "^9.5.7"
1314
fastapi = "^0.75.1"
1415
orjson = "^3.6.8"
1516
gunicorn = "^20.1.0"

settings.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
db.mongodb_uri = "mongodb://localhost:27017"
33
pulsar.url = "http://localhost:8913"
44
pulsar.status.poll_interval = 4
5-
pulsar.status.max_polls = 100
5+
pulsar.status.max_polls = 400
6+
pulsar.client_timeout = 30
67

78
logging.level = "DEBUG"
89
logging.output_json = false

tesp_api/api/endpoints/endpoint_utils.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -107,5 +107,14 @@ def resource_not_found_response(message: Maybe[str] = Nothing):
107107

108108

109109
def response_from_model(model: BaseModel, model_rules: dict = None) -> Response:
110-
return Response(model.json(**(model_rules if model_rules else {}), by_alias=False),
111-
status_code=200, media_type='application/json')
110+
response = Response(
111+
model.json(**(model_rules if model_rules else {}), by_alias=False),
112+
status_code=200,
113+
media_type='application/json'
114+
)
115+
# FORCE NO CACHING
116+
response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
117+
response.headers["Pragma"] = "no-cache"
118+
response.headers["Expires"] = "0"
119+
120+
return response

tesp_api/repository/task_repository.py

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -97,17 +97,20 @@ def cancel_task(
9797
p_author: Maybe[str],
9898
task_id: ObjectId
9999
) -> Promise:
100-
full_search_query = dict()
101-
full_search_query.update({'_id': task_id})
102-
full_search_query.update(p_author.maybe({}, lambda a: {'author': a}))
103-
104-
return Promise(lambda resolve, reject: resolve(full_search_query)) \
105-
.then(self._tasks.find_one) \
106-
.then(lambda _task: self.update_task(
107-
{'_id': task_id},
108-
{'$set': {'state': TesTaskState.CANCELED}}
109-
)).map(lambda updated_task: updated_task
110-
.map(lambda _updated_task: _updated_task.id))\
100+
search_query = {
101+
'_id': task_id,
102+
'state': {'$in': [
103+
TesTaskState.QUEUED,
104+
TesTaskState.INITIALIZING,
105+
TesTaskState.RUNNING
106+
]}
107+
}
108+
search_query.update(p_author.maybe({}, lambda a: {'author': a}))
109+
update_query = {'$set': {'state': TesTaskState.CANCELED}}
110+
111+
return self.update_task(search_query, update_query)\
112+
.map(lambda updated_task: updated_task
113+
.map(lambda _updated_task: _updated_task.id))\
111114
.catch(handle_data_layer_error)
112115

113116

tesp_api/service/event_actions.py

Lines changed: 82 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
from tesp_api.repository.task_repository import task_repository
1717
from tesp_api.service.file_transfer_service import file_transfer_service
1818
from tesp_api.service.error import pulsar_event_handle_error, TaskNotFoundError, TaskExecutorError
19-
from tesp_api.service.pulsar_operations import PulsarRestOperations, PulsarAmpqOperations, DataType
19+
from tesp_api.service.pulsar_operations import PulsarRestOperations, PulsarAmqpOperations, DataType
2020
from tesp_api.repository.model.task import (
2121
TesTaskState,
2222
TesTaskExecutor,
@@ -29,6 +29,7 @@
2929

3030
CONTAINER_TYPE = os.getenv("CONTAINER_TYPE", "docker")
3131

32+
3233
@local_handler.register(event_name="queued_task")
3334
def handle_queued_task(event: Event) -> None:
3435
"""
@@ -39,8 +40,9 @@ def handle_queued_task(event: Event) -> None:
3940
match pulsar_service.get_operations():
4041
case PulsarRestOperations() as pulsar_rest_operations:
4142
dispatch_event('queued_task_rest', {**payload, 'pulsar_operations': pulsar_rest_operations})
42-
case PulsarAmpqOperations() as pulsar_ampq_operations:
43-
dispatch_event('queued_task_ampq', {**payload, 'pulsar_operations': pulsar_ampq_operations})
43+
case PulsarAmqpOperations() as pulsar_amqp_operations:
44+
dispatch_event('queued_task_amqp', {**payload, 'pulsar_operations': pulsar_amqp_operations})
45+
4446

4547
@local_handler.register(event_name="queued_task_rest")
4648
async def handle_queued_task_rest(event: Event):
@@ -53,12 +55,37 @@ async def handle_queued_task_rest(event: Event):
5355

5456
print(f"Queued task rest: {task_id}")
5557

56-
await Promise(lambda resolve, reject: resolve(None))\
57-
.then(lambda nothing: pulsar_operations.setup_job(task_id))\
58-
.map(lambda setup_job_result: dispatch_event('initialize_task', {**payload, 'task_config': setup_job_result}))\
59-
.catch(lambda error: pulsar_event_handle_error(error, task_id, event_name, pulsar_operations))\
58+
await Promise(lambda resolve, reject: resolve(None)) \
59+
.then(lambda nothing: pulsar_operations.setup_job(task_id)) \
60+
.map(lambda setup_job_result: dispatch_event('initialize_task', {**payload, 'task_config': setup_job_result})) \
61+
.catch(lambda error: pulsar_event_handle_error(error, task_id, event_name, pulsar_operations)) \
6062
.then(lambda x: x) # Invokes promise, potentially from error handler
6163

64+
65+
@local_handler.register(event_name="queued_task_amqp")
66+
async def handle_queued_task_amqp(event: Event):
67+
"""
68+
Sets up the job in Pulsar via AMQP operations and dispatches an 'initialize_task' event.
69+
"""
70+
event_name, payload = event
71+
task_id: ObjectId = payload['task_id']
72+
pulsar_operations: PulsarAmqpOperations = payload['pulsar_operations']
73+
74+
print(f"Queued task AMQP: {task_id}")
75+
76+
try:
77+
# Setup job via AMQP
78+
setup_job_result = await pulsar_operations.setup_job(task_id)
79+
80+
# Dispatch initialize event
81+
await dispatch_event('initialize_task', {
82+
**payload,
83+
'task_config': setup_job_result
84+
})
85+
except Exception as error:
86+
await pulsar_event_handle_error(error, task_id, event_name, pulsar_operations)
87+
88+
6289
@local_handler.register(event_name="initialize_task")
6390
async def handle_initializing_task(event: Event) -> None:
6491
"""
@@ -69,12 +96,11 @@ async def handle_initializing_task(event: Event) -> None:
6996
task_id: ObjectId = payload['task_id']
7097
pulsar_operations: PulsarRestOperations = payload['pulsar_operations']
7198

72-
# Merged Logic: Using the feature-complete setup_data from the new version
7399
async def setup_data(job_id: ObjectId,
74-
resources: TesTaskResources,
75-
volumes: List[str],
76-
inputs: List[TesTaskInput],
77-
outputs: List[TesTaskOutput]):
100+
resources: TesTaskResources,
101+
volumes: List[str],
102+
inputs: List[TesTaskInput],
103+
outputs: List[TesTaskOutput]):
78104
resource_conf: dict
79105
volume_confs: List[dict] = []
80106
input_confs: List[dict] = []
@@ -109,28 +135,29 @@ async def setup_data(job_id: ObjectId,
109135
return resource_conf, volume_confs, input_confs, output_confs
110136

111137
print(f"Initializing task: {task_id}")
112-
await Promise(lambda resolve, reject: resolve(None))\
138+
await Promise(lambda resolve, reject: resolve(None)) \
113139
.then(lambda nothing: task_repository.update_task_state(
114-
task_id,
115-
TesTaskState.QUEUED,
116-
TesTaskState.INITIALIZING
117-
)).map(lambda updated_task: get_else_throw(
118-
updated_task, TaskNotFoundError(task_id, Just(TesTaskState.QUEUED))
119-
)).then(lambda updated_task: setup_data(
120-
task_id,
121-
maybe_of(updated_task.resources).maybe(None, lambda x: x),
122-
maybe_of(updated_task.volumes).maybe([], lambda x: x),
123-
maybe_of(updated_task.inputs).maybe([], lambda x: x),
124-
maybe_of(updated_task.outputs).maybe([], lambda x: x)
125-
)).map(lambda res_input_output_confs: dispatch_event('run_task', {
126-
**payload,
127-
'resource_conf': res_input_output_confs[0],
128-
'volume_confs': res_input_output_confs[1],
129-
'input_confs': res_input_output_confs[2],
130-
'output_confs': res_input_output_confs[3]
131-
})).catch(lambda error: pulsar_event_handle_error(error, task_id, event_name, pulsar_operations))\
140+
task_id,
141+
TesTaskState.QUEUED,
142+
TesTaskState.INITIALIZING
143+
)).map(lambda updated_task: get_else_throw(
144+
updated_task, TaskNotFoundError(task_id, Just(TesTaskState.QUEUED))
145+
)).then(lambda updated_task: setup_data(
146+
task_id,
147+
maybe_of(updated_task.resources).maybe(None, lambda x: x),
148+
maybe_of(updated_task.volumes).maybe([], lambda x: x),
149+
maybe_of(updated_task.inputs).maybe([], lambda x: x),
150+
maybe_of(updated_task.outputs).maybe([], lambda x: x)
151+
)).map(lambda res_input_output_confs: dispatch_event('run_task', {
152+
**payload,
153+
'resource_conf': res_input_output_confs[0],
154+
'volume_confs': res_input_output_confs[1],
155+
'input_confs': res_input_output_confs[2],
156+
'output_confs': res_input_output_confs[3]
157+
})).catch(lambda error: pulsar_event_handle_error(error, task_id, event_name, pulsar_operations)) \
132158
.then(lambda x: x)
133159

160+
134161
@local_handler.register(event_name="run_task")
135162
async def handle_run_task(event: Event) -> None:
136163
"""
@@ -146,8 +173,8 @@ async def handle_run_task(event: Event) -> None:
146173
input_confs: List[dict] = payload['input_confs']
147174
output_confs: List[dict] = payload['output_confs']
148175
pulsar_operations: PulsarRestOperations = payload['pulsar_operations']
149-
150-
run_command_str = None
176+
177+
run_command_str = None
151178
command_start_time = datetime.datetime.now(datetime.timezone.utc)
152179

153180
try:
@@ -175,7 +202,7 @@ async def handle_run_task(event: Event) -> None:
175202
)
176203

177204
stage_exec = TesTaskExecutor(image="willdockerhub/curl-wget:latest", command=[], workdir=Path("/downloads"))
178-
205+
179206
# Stage-in command
180207
stage_in_cmd = ""
181208
stage_in_mount = ""
@@ -211,7 +238,6 @@ async def handle_run_task(event: Event) -> None:
211238
non_empty_parts = [p.strip() for p in parts if p and p.strip()]
212239
run_command_str = " && ".join(non_empty_parts) if non_empty_parts else None
213240

214-
# Resume with the polished version's logic for execution and state management
215241
command_start_time = datetime.datetime.now(datetime.timezone.utc)
216242
command_status: dict
217243

@@ -231,27 +257,28 @@ async def handle_run_task(event: Event) -> None:
231257
command_status.get('returncode', -1)
232258
)
233259

234-
current_task_monad = await task_repository.get_task(maybe_of(author), {'_id': task_id})
235-
current_task_obj = get_else_throw(current_task_monad, TaskNotFoundError(task_id))
260+
current_task_monad = await task_repository.get_task(maybe_of(author), {'_id': task_id})
261+
current_task_obj = get_else_throw(current_task_monad, TaskNotFoundError(task_id))
236262

237263
if current_task_obj.state == TesTaskState.CANCELED:
238264
print(f"Task {task_id} found CANCELED after job completion polling. Aborting state changes.")
239-
return
265+
return
240266

241267
if command_status.get('returncode', -1) != 0:
242-
print(f"Task {task_id} executor error (return code: {command_status.get('returncode', -1)}). Setting state to EXECUTOR_ERROR.")
268+
print(
269+
f"Task {task_id} executor error (return code: {command_status.get('returncode', -1)}). Setting state to EXECUTOR_ERROR.")
243270
await task_repository.update_task_state(task_id, TesTaskState.RUNNING, TesTaskState.EXECUTOR_ERROR)
244271
await pulsar_operations.erase_job(task_id)
245-
return
272+
return
246273

247274
print(f"Task {task_id} completed successfully. Setting state to COMPLETE.")
248275
await Promise(lambda resolve, reject: resolve(None)) \
249276
.then(lambda ignored: task_repository.update_task_state(
250-
task_id, TesTaskState.RUNNING, TesTaskState.COMPLETE
251-
)) \
277+
task_id, TesTaskState.RUNNING, TesTaskState.COMPLETE
278+
)) \
252279
.map(lambda task_after_complete_update: get_else_throw(
253-
task_after_complete_update, TaskNotFoundError(task_id, Just(TesTaskState.RUNNING))
254-
)) \
280+
task_after_complete_update, TaskNotFoundError(task_id, Just(TesTaskState.RUNNING))
281+
)) \
255282
.then(lambda ignored: pulsar_operations.erase_job(task_id)) \
256283
.catch(lambda error: pulsar_event_handle_error(error, task_id, event_name, pulsar_operations)) \
257284
.then(lambda x: x)
@@ -262,22 +289,24 @@ async def handle_run_task(event: Event) -> None:
262289
await pulsar_operations.kill_job(task_id)
263290
await pulsar_operations.erase_job(task_id)
264291
print(f"Task {task_id} Pulsar job cleanup attempted after asyncio cancellation.")
265-
292+
266293
except Exception as error:
267294
print(f"Exception in handle_run_task for task {task_id}: {type(error).__name__} - {error}")
268295

269296
task_state_after_error_monad = await task_repository.get_task(maybe_of(author), {'_id': task_id})
270297
if task_state_after_error_monad.is_just() and task_state_after_error_monad.value.state == TesTaskState.CANCELED:
271-
print(f"Task {task_id} is already CANCELED. Exception '{type(error).__name__}' likely due to this. No further error processing by handler.")
272-
return
298+
print(
299+
f"Task {task_id} is already CANCELED. Exception '{type(error).__name__}' likely due to this. No further error processing by handler.")
300+
return
273301

274302
print(f"Task {task_id} not CANCELED; proceeding with pulsar_event_handle_error for '{type(error).__name__}'.")
275303
error_handler_result = pulsar_event_handle_error(error, task_id, event_name, pulsar_operations)
276304
if asyncio.iscoroutine(error_handler_result) or isinstance(error_handler_result, _Promise):
277305
await error_handler_result
278-
279-
try:
280-
print(f"Ensuring Pulsar job for task {task_id} is erased after general error handling in run_task.")
281-
await pulsar_operations.erase_job(task_id)
282-
except Exception as final_erase_error:
283-
print(f"Error during final Pulsar erase attempt for task {task_id} after general error: {final_erase_error}")
306+
307+
# try:
308+
# print(f"Ensuring Pulsar job for task {task_id} is erased after general error handling in run_task.")
309+
# await pulsar_operations.erase_job(task_id)
310+
# except Exception as final_erase_error:
311+
# print(
312+
# f"Error during final Pulsar erase attempt for task {task_id} after general error: {final_erase_error}")

0 commit comments

Comments
 (0)