Lumi-supercomputer · marlon-tobaben · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/02_Using_the_LUMI_web_interface/Clone_with_JupyterLab.md b/02_Using_the_LUMI_web_interface/Clone_with_JupyterLab.md
diff --git a/02_Using_the_LUMI_web_interface/GPT-neo-IMDB-introduction.ipynb b/02_Using_the_LUMI_web_interface/GPT-neo-IMDB-introduction.ipynb
@@ -39,7 +39,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "os.environ[\"HF_HOME\"] = \"/flash/project_465002178/hf-cache\""
+    "os.environ[\"HF_HOME\"] = \"/flash/project_465002757/hf-cache\""
    ]
   },
   {

diff --git a/02_Using_the_LUMI_web_interface/README.md b/02_Using_the_LUMI_web_interface/README.md
@@ -7,27 +7,26 @@
     In this exercise you will gain first experience with using the LUMI web interface to navigate files and directories on the LUMI supercomputer. You will also set up your own copy of the exercise repository on the system, so that you can work on them without interfering with the other course participants.
 
    1. Log in to the LUMI web interface: https://www.lumi.csc.fi
-   2. Create your own subdirectory in `/project/project_465002178/` and `/scratch/project_465002178/`. Use your username for the directory name. You can either
+   2. Create your own subdirectory in `/project/project_465002757/` and `/scratch/project_465002757/`. Use your username for the directory name. You can either
         - Use the built-in file explorer ("Home Directory"), or
         - Use the login node shell app in the webinterface
-   3. Clone the [exercise repository](https://github.com/Lumi-supercomputer/Getting_Started_with_AI_workshop) to your folder in `/project/project_465002178/<username>`. You can either
-        - use the login node shell app in the webinterface, or
-        - start a Jupyter lab job and use the Jupyter lab UI for cloning Git repositories, see [Clone_with_JupyterLab.md](./Clone_with_JupyterLab.md) for an illustrated step-by-step guide for this.
+   3. Clone the [exercise repository](https://github.com/Lumi-supercomputer/Getting_Started_with_AI_workshop) into your folder `/project/project_465002757/<username>`. You can use the login node shell app in the web interface to do this.
    4. Get familiar with the exercise repository layout.
 
 2. Start an interactive Jupyter lab job and run inference with GPT-neo.
 
     In this exercise you will learn how to reserve resources for and start an interactive job to run a Jupyter notebook via the LUMI web interface. The notebook itself introduces you to our running example of finetuning a language model using PyTorch and the training libraries provided by Huggingface. In this exercise you will not do any training, but familiarise yourself a bit with the software and the base model.
 
     1. Start an interactive Jupyter session: Open the Jupyter app (! not "Jupyter for Courses" !) in the LUMI webinterface and set the following settings before pressing `Launch`
-        - Project: `project_465002178 (LUST Training ...)`
+        - Project: `project_465002757 (LUST Training ...)`
         - Reservation: Use the course reservation `AI_workshop_Day1` (there should only be one available option)
         - Partition: `small-g`
         - Number of CPU cores: `7`
         - Memory (GB): `16`
         - Time: `0:30:00`
         - Working directory: `/project/$PROJECT`
-        - Python: `pytorch (Via CSC stack, limited support available)`
+        - Python: `lumi-multitorch (PyTorch, LUMI AI Factory)`
+        - Module version: You can use the default here.
         - Virtual environment path: leave empty
     2. Wait for the session to start, then press `Connect to Jupyter`
 

diff --git a/02_Using_the_LUMI_web_interface/images/step0.png b/02_Using_the_LUMI_web_interface/images/step0.png
diff --git a/02_Using_the_LUMI_web_interface/images/step1.png b/02_Using_the_LUMI_web_interface/images/step1.png
diff --git a/02_Using_the_LUMI_web_interface/images/step2.png b/02_Using_the_LUMI_web_interface/images/step2.png
diff --git a/03_Your_first_AI_training_job_on_LUMI/GPT-neo-IMDB-finetuning.py b/03_Your_first_AI_training_job_on_LUMI/GPT-neo-IMDB-finetuning.py
@@ -73,13 +73,17 @@
     print("Loading model and tokenizer")
     start = time.time()
     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = 50256 # adjusting tokenizer and model 
 
     # Load the actual base model from Hugging Face
     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
+    # Mirror the tokenizer's pad id (50256, GPT-Neo's end-of-text token) in the model and generation configs
+    model.config.pad_token_id = 50256
+    model.generation_config.pad_token_id = 50256
     model.to(device)
     stop = time.time()
     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
+    print ("\n" * 4)
 
     # #### Loading the IMDb data set
     #
@@ -99,6 +103,7 @@
     # Let's print one sample from the dataset.
     print("Sample from dataset")
     pprint(train_dataset[200])
+    print ("\n" * 4)
 
     # #### Setting up the training configuration
     train_batch_size = 32  # This just about fits into the VRAM of a single MI250x GCD with 16-bit floats
@@ -140,6 +145,7 @@
         print("Length of input_ids:", len(b["input_ids"]))
         break
     print("Length of dataset (tokenized)", len(train_dataset_tokenized))
+    print ("\n" * 4)
 
     # #### Training
     # We use the Hugging Face trainer instead of a manual training loop.
@@ -156,7 +162,7 @@
     trainer = Trainer(
         model=model,
         args=training_args,
-        tokenizer=tokenizer,
+        processing_class=tokenizer,
         data_collator=collator,
         train_dataset=train_dataset_tokenized,
         eval_dataset=validate_dataset_tokenized,
@@ -167,6 +173,7 @@
 
     print()
     print("Training done, you can find all the model checkpoints in", output_dir)
+    print ("\n" * 4)
 
     # #### Evaluating the finetuned model
     with torch.no_grad():

diff --git a/03_Your_first_AI_training_job_on_LUMI/README.md b/03_Your_first_AI_training_job_on_LUMI/README.md
@@ -39,7 +39,7 @@
         - `--model-name` (a name under which the model produced by the run will be stored; optional)
         - `--num-workers` (optional, is used to set the number of PyTorch dataloader processes)
 
-        Please set the paths to some destination of your choice within your `/scratch/project_465002178/<username>` directory.
+        Please set the paths to some destination of your choice within your `/scratch/project_465002757/<username>` directory.
 
         > **Tip**
         >

diff --git a/03_Your_first_AI_training_job_on_LUMI/reference_solution/GPT-neo-IMDB-finetuning.py b/03_Your_first_AI_training_job_on_LUMI/reference_solution/GPT-neo-IMDB-finetuning.py
@@ -73,13 +73,17 @@
     print("Loading model and tokenizer")
     start = time.time()
     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = 50256 # adjusting tokenizer and model 
 
     # Load the actual base model from Hugging Face
     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
+    # Mirror the tokenizer's pad id (50256, GPT-Neo's end-of-text token) in the model and generation configs
+    model.config.pad_token_id = 50256
+    model.generation_config.pad_token_id = 50256
     model.to(device)
     stop = time.time()
     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
+    print ("\n" * 4)
 
     # #### Loading the IMDb data set
     #
@@ -99,6 +103,7 @@
     # Let's print one sample from the dataset.
     print("Sample from dataset")
     pprint(train_dataset[200])
+    print ("\n" * 4)
 
     # #### Setting up the training configuration
     train_batch_size = 32  # This just about fits into the VRAM of a single MI250x GCD with 16-bit floats
@@ -140,6 +145,7 @@
         print("Length of input_ids:", len(b["input_ids"]))
         break
     print("Length of dataset (tokenized)", len(train_dataset_tokenized))
+    print ("\n" * 4)
 
     # #### Training
     # We use the Hugging Face trainer instead of a manual training loop.
@@ -156,7 +162,7 @@
     trainer = Trainer(
         model=model,
         args=training_args,
-        tokenizer=tokenizer,
+        processing_class=tokenizer,
         data_collator=collator,
         train_dataset=train_dataset_tokenized,
         eval_dataset=validate_dataset_tokenized,
@@ -167,6 +173,7 @@
 
     print()
     print("Training done, you can find all the model checkpoints in", output_dir)
+    print ("\n" * 4)
 
     # #### Evaluating the finetuned model
     with torch.no_grad():

diff --git a/...training_job_on_LUMI/reference_solution/resume_from_checkpoint/GPT-neo-IMDB-finetuning.py b/...training_job_on_LUMI/reference_solution/resume_from_checkpoint/GPT-neo-IMDB-finetuning.py
@@ -79,13 +79,17 @@
     print("Loading model and tokenizer")
     start = time.time()
     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = 50256 # adjusting tokenizer and model 
 
     # Load the actual base model from Hugging Face
     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
+    # Mirror the tokenizer's pad id (50256, GPT-Neo's end-of-text token) in the model and generation configs
+    model.config.pad_token_id = 50256
+    model.generation_config.pad_token_id = 50256
     model.to(device)
     stop = time.time()
     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
+    print ("\n" * 4)
 
     # #### Loading the IMDb data set
     #
@@ -105,6 +109,7 @@
     # Let's print one sample from the dataset.
     print("Sample from dataset")
     pprint(train_dataset[200])
+    print ("\n" * 4)
 
     # #### Setting up the training configuration
     train_batch_size = 32  # This just about fits into the VRAM of a single MI250x GCD with 16-bit floats
@@ -147,6 +152,7 @@
         print("Length of input_ids:", len(b["input_ids"]))
         break
     print("Length of dataset (tokenized)", len(train_dataset_tokenized))
+    print ("\n" * 4)
 
     # #### Training
     # We use the Hugging Face trainer instead of a manual training loop.
@@ -163,7 +169,7 @@
     trainer = Trainer(
         model=model,
         args=training_args,
-        tokenizer=tokenizer,
+        processing_class=tokenizer,
         data_collator=collator,
         train_dataset=train_dataset_tokenized,
         eval_dataset=validate_dataset_tokenized,
@@ -174,6 +180,7 @@
 
     print()
     print("Training done, you can find all the model checkpoints in", output_dir)
+    print ("\n" * 4)
 
     # #### Evaluating the finetuned model
     with torch.no_grad():

diff --git a/03_Your_first_AI_training_job_on_LUMI/reference_solution/resume_from_checkpoint/run.sh b/03_Your_first_AI_training_job_on_LUMI/reference_solution/resume_from_checkpoint/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --account=project_465002178
+#SBATCH --account=project_465002757
 #SBATCH --reservation=AI_workshop_Day1   # comment this out if the reservation is no longer available
 #SBATCH --partition=small-g
 #SBATCH --gpus-per-node=1
@@ -10,14 +10,14 @@
 
 # Set up the software environment
 # NOTE: the loaded module makes relevant filesystem locations available inside the singularity container
-#   (/scratch, /project, etc) as well as mounts some important system libraries that are optimized for LUMI
+#   (/scratch, /project, etc.).
 # If you are interested, you can check the exact paths being mounted from
-#   /appl/local/containers/ai-modules/singularity-AI-bindings/24.03.lua
+#   /appl/local/laifs/modules/lumi-aif-singularity-bindings/1.0.1.lua
 module purge
-module use /appl/local/containers/ai-modules
-module load singularity-AI-bindings
+module use /appl/local/laifs/modules
+module load lumi-aif-singularity-bindings
 
-CONTAINER=/appl/local/containers/sif-images/lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0.sif
+CONTAINER=/appl/local/laifs/containers/lumi-multitorch-u24r70f21m50t210-20260513_121430/lumi-multitorch-full-u24r70f21m50t210-20260513_121430.sif
 
 # Some environment variables to set up cache directories
 SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
@@ -35,7 +35,7 @@ export OUTPUT_DIR=$SCRATCH/$USER/data/
 export LOGGING_DIR=$SCRATCH/$USER/runs/
 
 set -xv # print the command so that we can verify setting arguments correctly from the logs
-srun singularity exec $CONTAINER \
+srun singularity run $CONTAINER \
     python GPT-neo-IMDB-finetuning.py \
         --model-name gpt-imdb-model \
         --output-path $OUTPUT_DIR \

diff --git a/03_Your_first_AI_training_job_on_LUMI/reference_solution/run.sh b/03_Your_first_AI_training_job_on_LUMI/reference_solution/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --account=project_465002178
+#SBATCH --account=project_465002757
 #SBATCH --reservation=AI_workshop_Day1   # comment this out if the reservation is no longer available
 #SBATCH --partition=small-g
 #SBATCH --gpus-per-node=1
@@ -10,14 +10,14 @@
 
 # Set up the software environment
 # NOTE: the loaded module makes relevant filesystem locations available inside the singularity container
-#   (/scratch, /project, etc) as well as mounts some important system libraries that are optimized for LUMI
+#   (/scratch, /project, etc.).
 # If you are interested, you can check the exact paths being mounted from
-#   /appl/local/containers/ai-modules/singularity-AI-bindings/24.03.lua
+#   /appl/local/laifs/modules/lumi-aif-singularity-bindings/1.0.1.lua
 module purge
-module use /appl/local/containers/ai-modules
-module load singularity-AI-bindings
+module use /appl/local/laifs/modules
+module load lumi-aif-singularity-bindings
 
-CONTAINER=/appl/local/containers/sif-images/lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0.sif
+CONTAINER=/appl/local/laifs/containers/lumi-multitorch-u24r70f21m50t210-20260513_121430/lumi-multitorch-full-u24r70f21m50t210-20260513_121430.sif
 
 # Some environment variables to set up cache directories
 SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
@@ -35,7 +35,7 @@ export OUTPUT_DIR=$SCRATCH/$USER/data/
 export LOGGING_DIR=$SCRATCH/$USER/runs/
 
 set -xv # print the command so that we can verify setting arguments correctly from the logs
-srun singularity exec $CONTAINER \
+srun singularity run $CONTAINER \
     python GPT-neo-IMDB-finetuning.py \
         --model-name gpt-imdb-model \
         --output-path $OUTPUT_DIR \

diff --git a/03_Your_first_AI_training_job_on_LUMI/run.sh b/03_Your_first_AI_training_job_on_LUMI/run.sh
@@ -1,19 +1,19 @@
 #!/bin/bash
-#SBATCH --account=project_465002178
+#SBATCH --account=project_465002757
 #SBATCH --reservation=AI_workshop_Day1   # comment this out if the reservation is no longer available
 #SBATCH --partition=...
 ## <!!! ACTION REQUIRED: SPECIFY ADDITIONAL SLURM PARAMETERS HERE!!!>
 
 # Set up the software environment
 # NOTE: the loaded module makes relevant filesystem locations available inside the singularity container
-#   (/scratch, /project, etc) as well as mounts some important system libraries that are optimized for LUMI
+#   (/scratch, /project, etc.).
 # If you are interested, you can check the exact paths being mounted from
-#   /appl/local/containers/ai-modules/singularity-AI-bindings/24.03.lua
+#   /appl/local/laifs/modules/lumi-aif-singularity-bindings/1.0.1.lua
 module purge
-module use /appl/local/containers/ai-modules
-module load singularity-AI-bindings
+module use /appl/local/laifs/modules
+module load lumi-aif-singularity-bindings
 
-CONTAINER=/appl/local/containers/sif-images/lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0.sif
+CONTAINER=/appl/local/laifs/containers/lumi-multitorch-u24r70f21m50t210-20260513_121430/lumi-multitorch-full-u24r70f21m50t210-20260513_121430.sif
 
 # Some environment variables to set up cache directories
 SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"

diff --git a/08_Scaling_to_multiple_GPUs/GPT-neo-IMDB-finetuning.py b/08_Scaling_to_multiple_GPUs/GPT-neo-IMDB-finetuning.py
@@ -75,13 +75,17 @@
     print("Loading model and tokenizer")
     start = time.time()
     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = 50256 # adjusting tokenizer and model 
 
     # Load the actual base model from Hugging Face
     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
+    # Mirror the tokenizer's pad id (50256, GPT-Neo's end-of-text token) in the model and generation configs
+    model.config.pad_token_id = 50256
+    model.generation_config.pad_token_id = 50256
     model.to(device)
     stop = time.time()
     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
+    print ("\n" * 4)
 
     # #### Loading the IMDb data set
     #
@@ -101,6 +105,7 @@
     # Let's print one sample from the dataset.
     print("Sample from dataset")
     pprint(train_dataset[200])
+    print ("\n" * 4)
 
     # #### Setting up the training configuration
     # <!!! ACTION REQUIRED: ADJUST THIS SO THAT EACH PROCESS ONLY HANDLES A SHARE OF THE TOTAL BATCH SIZE !!!>
@@ -143,6 +148,7 @@
         print("Length of input_ids:", len(b["input_ids"]))
         break
     print("Length of dataset (tokenized)", len(train_dataset_tokenized))
+    print ("\n" * 4)
 
     # #### Training
     # We use the Hugging Face trainer instead of a manual training loop.
@@ -155,7 +161,7 @@
     trainer = Trainer(
         model=model,
         args=training_args,
-        tokenizer=tokenizer,
+        processing_class=tokenizer,
         data_collator=collator,
         train_dataset=train_dataset_tokenized,
         eval_dataset=validate_dataset_tokenized,
@@ -166,6 +172,7 @@
 
     print()
     print("Training done, you can find all the model checkpoints in", output_dir)
+    print ("\n" * 4)
 
     # #### Evaluating the finetuned model
     with torch.no_grad():