Skip to content

Commit ef745dd

Browse files
committed
Add support for appsetup script to be a local file
Current support for appsetup script relies on URL, which the task setup executes a curl to obtain the file. By supporting local file, one does not require to upload a file to a server (e.g. git).
1 parent 53a4b33 commit ef745dd

File tree

7 files changed

+187
-60
lines changed

7 files changed

+187
-60
lines changed

poetry.lock

Lines changed: 49 additions & 43 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ azure-mgmt-resource = "^23.0.1"
2121
matplotlib = "^3.8.2"
2222
pytest = "^7.4.4"
2323
azure-mgmt-netapp = "^13.2.0"
24+
azure-storage-blob = "^12.25.1"
25+
azure-mgmt-storage = "^23.0.0"
2426

2527

2628
[build-system]

src/hpcadvisor/__main__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def collect_handler(args):
5151
cleartasks = args.cleartasks
5252
keeppools = args.keeppools
5353
reusepools = args.reusepools
54+
keepjobs = args.keepjobs
5455

5556
from hpcadvisor import main_cli, utils
5657
from hpcadvisor.task_selection_policy import get_policy_class
@@ -70,6 +71,7 @@ def collect_handler(args):
7071
"cleartasks": cleartasks,
7172
"keeppools": keeppools,
7273
"reusepools": reusepools,
74+
"keepjobs": keepjobs,
7375
"policy": policy,
7476
}
7577

@@ -173,6 +175,10 @@ def _process_arguments():
173175
collect.add_argument(
174176
"-rp", "--reusepools", help="Reuse pools", required=False, action="store_true"
175177
)
178+
collect.add_argument(
179+
"-kj", "--keepjobs", help="Keep jobs", required=False, action="store_true"
180+
)
181+
176182
collect.set_defaults(func=collect_handler)
177183

178184
plot = subparsers.add_parser("plot", help="Plot generator help")

src/hpcadvisor/batch_handler.py

Lines changed: 92 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
import sys
77
import time
88
from datetime import timedelta
9+
import re
10+
from urllib.parse import urlparse
11+
912

1013
import azure.batch.models as batchmodels
1114
import numpy as np
@@ -23,11 +26,15 @@
2326
VirtualMachine,
2427
VirtualMachineImage,
2528
)
29+
30+
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
2631
from azure.mgmt.monitor import MonitorManagementClient
2732
from azure.mgmt.netapp import NetAppManagementClient
2833
from azure.mgmt.netapp.models import NetAppAccount
2934
from azure.mgmt.network import NetworkManagementClient
3035
from azure.mgmt.resource import ResourceManagementClient, SubscriptionClient
36+
from azure.mgmt.storage import StorageManagementClient
37+
from azure.batch.models import ResourceFile
3138

3239
from hpcadvisor import dataset_handler, logger, taskset_handler, utils
3340
from hpcadvisor.azure_identity_credential_adapter import AzureIdentityCredentialAdapter
@@ -123,7 +130,7 @@ def wait_pool_ready(poolid):
123130
def verify_pool():
124131
pool_config = batch_client.pool.get(poolid)
125132
if pool_config.resize_errors and len(pool_config.resize_errors) > 0:
126-
log.error(f"Pool {pool_id} resize failed.")
133+
log.error(f"Pool {poolid} resize failed.")
127134
msg = "\n".join(
128135
[f" {e.code}: {e.message}" for e in pool_config.resize_errors]
129136
)
@@ -564,7 +571,7 @@ def create_pool(sku, number_of_nodes):
564571
anfmountdir = env["ANFMOUNTDIR"]
565572
anfvolume = env["ANFVOLUMENAME"]
566573
script = f"""
567-
sleep 60
574+
sleep 120
568575
sudo mkdir {anfmountdir} ; mount {anf_ip}:/{anfvolume} {anfmountdir}
569576
sudo chown _azbatch:_azbatchgrp {anfmountdir}
570577
sudo df -Tha
@@ -656,6 +663,61 @@ def create_pool(sku, number_of_nodes):
656663

657664
return poolname
658665

666+
def is_url(input_string):
667+
668+
is_url_bool = False
669+
parsed = urlparse(input_string)
670+
671+
if parsed.scheme in ("http", "https") and parsed.netloc:
672+
is_url_bool = True
673+
674+
domain_pattern = re.compile(r"^[\w.-]+\.[a-z]{2,}(?:/|$)")
675+
if domain_pattern.match(input_string):
676+
is_url_bool = True
677+
678+
if os.path.exists(input_string):
679+
is_url_bool = False
680+
681+
log.debug(f"Input string '{input_string}' is URL: {is_url_bool}")
682+
return is_url_bool
683+
684+
def get_blob_storage_key(subscription_id, resource_group, storage_account_name):
685+
credential = DefaultAzureCredential()
686+
storage_client = StorageManagementClient(credential, subscription_id)
687+
keys = storage_client.storage_accounts.list_keys(resource_group, storage_account_name)
688+
return keys.keys[0].value # Use the first key
689+
690+
def upload_file_to_blob(local_file_path, blob_storage_account, blob_storage_key, blob_container):
691+
692+
log.info(f"Uploading file {local_file_path} to blob storage")
693+
blob_storage_key = get_blob_storage_key(
694+
env["SUBSCRIPTION"],
695+
env["RG"],
696+
blob_storage_account
697+
)
698+
699+
log.debug(f"blob_storage_account={blob_storage_account}, blob_container={blob_container}, blob_storage_key={blob_storage_key}")
700+
701+
script_name = os.path.basename(local_file_path)
702+
blob_service_client = BlobServiceClient(
703+
f"https://{blob_storage_account}.blob.core.windows.net", credential=blob_storage_key
704+
)
705+
blob_client = blob_service_client.get_blob_client(container=blob_container, blob=script_name)
706+
with open(local_file_path, "rb") as data:
707+
blob_client.upload_blob(data, overwrite=True)
708+
709+
sas_token = generate_blob_sas(
710+
account_name=blob_storage_account,
711+
container_name=blob_container,
712+
blob_name=script_name,
713+
account_key=blob_storage_key,
714+
permission=BlobSasPermissions(read=True),
715+
expiry=datetime.datetime.now(datetime.timezone.utc) + timedelta(hours=2)
716+
)
717+
718+
log.info(f"File {script_name} uploaded to blob storage: {blob_client.url}")
719+
log.debug(f"SAS token: {sas_token}")
720+
return f"{blob_client.url}?{sas_token}", script_name
659721

660722
def create_setup_task(jobid, appsetupurl):
661723
log.info(f"Creating setup task for jobid={jobid}")
@@ -664,8 +726,7 @@ def create_setup_task(jobid, appsetupurl):
664726
log.critical("batch_client is None")
665727
return
666728

667-
app_setup_url = appsetupurl
668-
log.debug(f"application setup url: {app_setup_url}")
729+
log.debug(f"application setup url: {appsetupurl}")
669730

670731
random_code = utils.get_random_code()
671732
task_id = f"task-app-setup-{random_code}"
@@ -674,18 +735,37 @@ def create_setup_task(jobid, appsetupurl):
674735
log.critical(f"jobid is None and cannot create setup task {task_id}")
675736
return
676737

677-
script_name = os.path.basename(app_setup_url)
738+
blob_storage_account = env["BLOBSTORAGEACCOUNT"]
739+
blob_storage_key = env["BLOBSTORAGEKEY"]
740+
blob_container = env["BLOBCONTAINER"]
741+
742+
743+
appsetup_is_url = is_url(appsetupurl)
744+
if not appsetup_is_url:
745+
log.debug(f"appsetupurl is not a URL: {appsetupurl}")
746+
app_setup_url, script_name = upload_file_to_blob(appsetupurl, blob_storage_account, blob_storage_key, blob_container)
747+
resource_files = [ResourceFile(http_url=app_setup_url, file_path=script_name)]
748+
else:
749+
script_name = os.path.basename(appsetupurl)
750+
resource_files = []
751+
678752
log.debug(f"script for application: {script_name}")
679753

680754
anfmountdir = env["ANFMOUNTDIR"]
681-
755+
682756
if anfenabled:
683-
task_commands = [
684-
f"/bin/bash -c 'set ; cd {anfmountdir} ; curl -sLO {app_setup_url} ; source {script_name} ; {HPCADVISOR_FUNCTION_SETUP}'"
685-
]
757+
if not appsetup_is_url:
758+
task_commands = [
759+
f"/bin/bash -c 'set ; cp {script_name} {anfmountdir} ; cd {anfmountdir} ; sudo chown _azbatch:_azbatchgrp {script_name} ; source {script_name} ; {HPCADVISOR_FUNCTION_SETUP}'"
760+
]
761+
else:
762+
task_commands = [
763+
f"/bin/bash -c 'set ; cd {anfmountdir} ; curl -sLO {appsetupurl} ; sudo chown _azbatch:_azbatchgrp {script_name} ; source {script_name} ; {HPCADVISOR_FUNCTION_SETUP}'"
764+
]
686765
else:
766+
# TO FIX url versus file
687767
task_commands = [
688-
f"/bin/bash -c 'set ; cd $AZ_BATCH_NODE_MOUNTS_DIR/data ; curl -sLO {app_setup_url} ; source {script_name} ; {HPCADVISOR_FUNCTION_SETUP}'"
768+
f"/bin/bash -c 'set ; cd $AZ_BATCH_NODE_MOUNTS_DIR/data ; curl -sLO {appsetupurl} ; source {script_name} ; {HPCADVISOR_FUNCTION_SETUP}'"
689769
]
690770

691771
log.debug(f"task command: {task_commands}")
@@ -701,6 +781,7 @@ def create_setup_task(jobid, appsetupurl):
701781
id=task_id,
702782
user_identity=user,
703783
command_line=task_commands[0],
784+
resource_files=resource_files,
704785
)
705786

706787
batch_client.task.add(job_id=jobid, task=task)
@@ -793,7 +874,7 @@ def create_compute_task(
793874
anfmountdir = env["ANFMOUNTDIR"]
794875
if anfenabled:
795876
task_commands = [
796-
f"/bin/bash -c 'export HOSTLIST_PPN=$(paste -d, -s <(sed \"s/ slots=[0-9]\+/:{ppn}/\" $HOSTFILE_PATH)); set ; source {anfmountdir}/{app_run_script} ; cd {fulltaskrundir}; {HPCADVISOR_FUNCTION_RUN}'",
877+
f"/bin/bash -c 'export HOSTLIST_PPN=$(paste -d, -s <(sed \"s/ slots=[0-9]\+/:{ppn}/\" $HOSTFILE_PATH)); set ; whoami ; source {anfmountdir}/{app_run_script} ; pwd; ls -l ; cat {anfmountdir}/{app_run_script} ; cd {anfmountdir} ; pwd ; ls -l ; cd {fulltaskrundir}; {HPCADVISOR_FUNCTION_RUN}'",
797878
]
798879
else:
799880
task_commands = [

src/hpcadvisor/data_collector.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,9 @@ def process_task_completion(
183183

184184
taskset_handler.update_task_status(task["id"], tasks_file, task_status)
185185

186-
if not collector_config["keeppools"]:
186+
if "keeppools" not in collector_config or not collector_config["keeppools"]:
187187
batch_handler.delete_pool(poolname)
188-
if not collector_config["keepjobs"]:
188+
if "keepjobs" not in collector_config or not collector_config["keepjobs"]:
189189
batch_handler.delete_job(jobname)
190190

191191

0 commit comments

Comments
 (0)