Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchtools/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def initialize_dir(self, no_git=False):
# store tasks
task_types = set([task.storage_type for task in self.tasks.values()])
if 'csv' in task_types:
os.mkdir(self.bench_path,'tasks')
os.mkdir(os.path.join(self.bench_path,'tasks'))
for task_name, task_object in self.tasks.items():
task_object.write(self.bench_path)

Expand Down
4 changes: 2 additions & 2 deletions benchtools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def add_task(task_name, bench_path, task_source,task_type):
@benchtool.command()
@click.argument('benchmark-path', required = True, type=str)
@click.argument('task_name', required = True)
@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'aws']),
@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'bedrock']),
default="ollama", help="The engine that will run your LLM.")
@click.option('-m', '--model', type=str, default="gemma3",
help="The LLM to be benchmarked.")
Expand Down Expand Up @@ -168,7 +168,7 @@ def run_task(benchmark_path: str, task_name, runner_type, model, api_url, log_pa

@benchtool.command()
@click.argument('benchmark-path', required = True, type=str)
@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'aws']),
@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'bedrock']),
default="ollama", help="The engine that will run your LLM.")
@click.option('-m', '--model', type=str, default="gemma3",
help="The LLM to be benchmarked.")
Expand Down
38 changes: 34 additions & 4 deletions benchtools/task.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# defines a class object for a task
# from openai import OpenAI
import os
import yaml # requires pyyaml
import yaml
import json
import boto3
import pandas as pd
from ollama import chat, ChatResponse, Client
from benchtools.logger import init_log_folder, log_interaction
Expand Down Expand Up @@ -204,11 +206,18 @@ def generate_prompts(self):
# TODO: consider if this could be a generator function if there are a lot of variants, to avoid memory issues. For now, we will assume that the number of variants is small enough to generate all prompts at once.
if self.variant_values:
id_prompt_list = []
for value_set in self.variant_values:

keys = self.variant_values.keys()

for i in range(len(list(self.variant_values.values())[0])):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no for i in range loops, that's not pythonic style and is very hard to parse

single_dict={}
prompt = self.template
prompt = prompt.format(**value_set)
prompt_id = self.prompt_id_generator(self.task_id,value_set)
for key in keys:
single_dict.update({key: self.variant_values[key][i]})
prompt = prompt.format(**single_dict)
prompt_id = self.prompt_id_generator(self.task_id,single_dict)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no. if the thing isn't working then it's because the data got loaded wrong. this makes no sense

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested it before and after. Before, it was giving an error about the passed data not being Map data (paraphrasing, not verbatim).

I looked up the format method and from what I saw it takes a simple dict object with key-value. I didn't see an instance where it took a dict of key-[list of values].

unless I didn't understand what exactly you were trying to do...

id_prompt_list.append((prompt_id,prompt))

return id_prompt_list
else:
return [(self.name, self.template)]
Expand Down Expand Up @@ -260,6 +269,9 @@ def write_csv(self, target_folder):
'''
write the task to a csv file with a task.txt template file
'''
# Create task folder
os.mkdir(os.path.join(target_folder, self.task_id))

# write the template
with open(os.path.join(target_folder,self.task_id, 'template.txt'), 'w') as f:
f.write(self.template)
Expand Down Expand Up @@ -358,6 +370,24 @@ def run(self, runner=BenchRunner(), log_dir='logs', benchmark=None, bench_path=N
)
response = chat_completion.choices[0].message.content
responses.append(response)
case "bedrock":
bedrock_client = boto3.client('bedrock-runtime')
completeion = bedrock_client.invoke_model(
modelId = runner.model,
body = json.dumps(
{
'messages': [
{
'role': 'user',
'content': sub_task
}
]
}
)
)
response = json.loads(completeion['body'].read())
response = response['choices'][0]['message']['content']
responses.append(response)
case _:
print(f"Runner type {runner.runner_type} not supported")
return None
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ dependencies = [
"pandas",
"datasets",
"openai",
"ollama"
"ollama",
"boto3"
]
requires-python = ">=3.10"
authors = [
Expand Down