ml4sts
diff --git a/‎benchtools/benchmark.py‎
Lines changed: 70 additions & 29 deletions b/‎benchtools/benchmark.py‎
Lines changed: 70 additions & 29 deletions
diff --git a/‎benchtools/betterbench.py‎
Lines changed: 4 additions & 5 deletions b/‎benchtools/betterbench.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎benchtools/cli.py‎
Lines changed: 37 additions & 7 deletions b/‎benchtools/cli.py‎
Lines changed: 37 additions & 7 deletions
@@ -5,8 +5,11 @@
 import requests
 import yaml
 # from pathlib import Path # ???
-from .task import Task
+from benchtools.task import Task
 from pathlib import PurePath
+from benchtools.runner import BenchRunner
+from .utils import load_asset
+
 
 about_template = """# {bench_name}
 
@@ -42,7 +45,7 @@ class Bench():
     run()
         Run one task or all tasks of the benchmark.
     '''
-    def __init__(self, name, base_path='.', bench_path=None, concept = None, tasks=[]):
+    def __init__(self, name, base_path='.', bench_path=None, concept=None, tasks=[]):
         '''
         Initialize the benchmark object with the name and path to the benchmark folder.
 
@@ -52,11 +55,10 @@ def __init__(self, name, base_path='.', bench_path=None, concept = None, tasks=[
             name of the benchmark will be used for folder
         path: str or buffer
             path where the benchmark will be stored 
-        
         tasks: list of Task objects
-            list of tasks to be included in the benchmark. Each task should be an instance of the
-        
+            list of tasks to be included in the benchmark. Each task should be an instance of the Task class
         '''
+
         # set up the object attributes
         self.display_name = name.strip()
         self.concept  = concept if concept else f'a benchmark about {name.strip()}'
@@ -66,21 +68,26 @@ def __init__(self, name, base_path='.', bench_path=None, concept = None, tasks=[
             self.base_path = PurePath(bench_path).parent
             self.bench_path = bench_path
         else:
+            # TODO: this way we don't have a base_path if above were true
             self.base_path = base_path
             self.bench_path = os.path.join(base_path, self.bench_name)
 
         self.tasks_folder = os.path.join(self.bench_path, 'tasks')
         if tasks:
-            self.tasks = {t.name:t for t in tasks} # initialize a task object for each task.
+            # All task objects have to be initialized before adding them to a benchmark
+            self.tasks = {t.name:t for t in tasks} 
         else:
             self.tasks = {}
 
+        # Written if the benchmark directory has been initialized
         self.written = os.path.exists(self.bench_path)
 
+    
     @classmethod
     def from_folders(cls, bench_path):
         '''
-        Load a benchmark from a given path. The path should point to the benchmark folder.
+        Load a benchmark object from a given path. 
+        The path should point to the benchmark folder.
 
         Parameters:
         -----------
@@ -113,7 +120,12 @@ def from_folders(cls, bench_path):
             for task_dir in task_list:
                 # load the tasks
                 task_path = os.path.join(task_folder, task_dir)
-                task = Task.from_txt_csv(task_path)
+                task_content = os.listdir(task_path)
+                if 'task_info.yml' in task_content:
+                    task_info_file = os.path.join(task_path, 'task_info.yml')
+                    task = Task.from_dict(task_info_file)
+                else:
+                    task = Task.from_txt_csv(task_path)
                 tasks.append(task)
         else:
             tasks = []
@@ -126,7 +138,7 @@ def from_folders(cls, bench_path):
     @classmethod
     def from_yaml(cls, bench_path):
         """
-        Load tasks from a YAML file and generate Task objects.
+        Load tasks from a YAML file and generate Task objects and add them to the bench
 
         Parameters
         ----------
@@ -185,12 +197,9 @@ def initialize_dir(self, no_git=False):
         # Create a benchmarks folder with tasks in them
         tasks_path = os.path.join(self.bench_path, "tasks")
         os.mkdir(tasks_path)
-        log_path = os.path.join(self.bench_path, "logs") # Do we want a log dir?
-        os.mkdir(log_path) # Do we want a log dir?
 
         # Create about.md
         about_path = os.path.join(self.bench_path, "about.md")
-        
         about_body = f"*{self.concept}*"
         about_text= about_template.format(bench_name=self.bench_name, 
                                            text = about_body)
@@ -252,36 +261,67 @@ def init_repo(self, bench_path):
         except:
             print("git might not be initialized in your system. Please run \"git init . \" when setup")
         # Get python gitignore template and create .gitignore
-        ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore")
-        if ignore_text.status_code == 200:
-            with open(".gitignore", 'a') as f:
-                f.write(ignore_text.text)
+        ignore_text = load_asset('.gitignore')
+        # ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore")
+        # if ignore_text.status_code == 200:
+        with open(".gitignore", 'a') as f:
+            f.write(ignore_text)
         os.chdir(current_dir)
 
 
-    def add_task(self, task_object):
-        # TODO: Look at content to create Task objects and add them to tasks
-        # setup_task(self.tasks_folder, task_name, task_source))
+    def add_task(self, task_object:Task):
+        '''
+        Register a task with the benchmark and, if the benchmark already
+        exists on disk, write the task into the benchmark's tasks folder.
+
+        Parameters:
+        -----------
+        task_object: Task
+            the task to add. It is stored in `self.tasks` under
+            `task_object.name`, replacing any entry already stored under that name.
+        '''
 
-        # self.tasks.append(task)
+        # Add task object to bench's tasks
         self.tasks[task_object.name] = task_object
 
+        # Check if written or not to write the task in the directory
+        # (`self.written` is set in __init__ from whether bench_path exists)
+        if self.written:
+            # NOTE(review): the folder is named after `task_object.id` while the
+            # dict above is keyed by `task_object.name` -- confirm this is intended
+            task_folder = os.path.join(self.tasks_folder, task_object.id)
+            if not os.path.exists(task_folder):
+                os.mkdir(task_folder)
+            else:
+                # TODO: What happens if true?
+                # For now an existing folder is reused as-is; write() below
+                # is still called on it.
+                pass
+            task_object.write(task_folder)
 
-    def run(self,model='gemma3',runner_type="ollama", api_url=None,):
+
+    def run(self, runner=None, log_dir=None):
         '''
         Run the benchmark by running each task in the benchmark and logging the interactions.
         Parameters:
         -----------
-        model: str default 'gemma3'
-            The name of the model to use for running the tasks. Default is 'gemma3'.
+        runner: BenchRunner, optional
+            define which runner should be used for the task.
+            When omitted, a new `BenchRunner()` with its default settings is
+            created for this call.
+        
+            runner.model : string
+                the model to run the task on
+            runner.api_url : string
+                the url of the api to use for the task
+            runner.runner_type: {ollama,openai}
+                to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434
+                to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable
+        log_dir: str, optional
+            Path to where the logs should be saved. When omitted, the benchmark
+            must already be written to disk.
+
+        Raises:
+        -------
+        ValueError
+            if no log_dir is given and the benchmark has not been written to disk.
         '''
-        if not self.written:
+        if not log_dir and not self.written:
             raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
-        # TODO deal with results 
+
+        # Build the default runner at call time. A `BenchRunner()` default in the
+        # signature would be evaluated once, when the class body is executed, and
+        # that single instance would then be shared by every call.
+        if runner is None:
+            runner = BenchRunner()
+
+        # Run each task
         for name, task in self.tasks.items():
-            self.run_task(task, model,runner_type, api_url)
+            self.run_task(task, runner, log_dir)
+
+
+    def run_task(self, target_task=None, runner=None, log_dir=None):
+        '''
+        Run a specific task of the benchmark.
+
+        Parameters:
+        -----------
+        target_task: str or Task, optional
+            the task name or the Task object to run. When omitted, the first
+            task registered in the benchmark is used.
+        runner: BenchRunner, optional
+            the runner to execute the task with. When omitted, a new
+            `BenchRunner()` with its default settings is created for this call.
+        log_dir: str, optional
+            Path to where the logs should be saved. Defaults to the `logs`
+            folder inside the benchmark folder.
+
+        Raises:
+        -------
+        ValueError
+            if no log_dir is given and the benchmark has not been written to
+            disk, or if no target_task is given and the benchmark has no tasks.
+        '''
+        if not log_dir and not self.written:
+            raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
+
+        # Build the default runner at call time rather than in the signature,
+        # where it would be created once at definition time and shared by all calls.
+        if runner is None:
+            runner = BenchRunner()
+
+        # If user doesn't specify a log_dir, default to logs folder inside bench folder
+        if not log_dir:
+            log_dir = os.path.join(self.bench_path, 'logs')
 
-    def run_task(self, target_task=None, model='gemma3',runner_type="ollama", api_url=None): 
         if not(target_task):
             # TODO: use a generator and make this have a state
-            target_task = list[self.tasks.keys()][0] 
+            if not self.tasks:
+                raise ValueError("Benchmark has no tasks to run.")
+            # `list[...]` subscripted the builtin type instead of calling it and
+            # always raised TypeError; take the first registered task name instead.
+            target_task = next(iter(self.tasks))
@@ -292,8 +332,9 @@ def run_task(self, target_task=None, model='gemma3',runner_type="ollama", api_ur
             task_object = target_task
         else:
             raise ValueError("target_task should be either a string (task name) or a Task object.")
+
+        # TODO: Add log_dir to attributes?
 
-        logging_path = os.path.join(self.bench_path, 'logs')
-        return task_object.run(model,runner_type, api_url,logging_path)
+        return task_object.run(runner, log_dir, self.bench_name, self.bench_path)
 
 
@@ -94,8 +94,7 @@ def better_session(bench_path) -> dict:
     # Loop until user opts out 
     for question, criteria in main_checklist.items(): 
         # TODO: add if(bench_checklist[skipped])
-        # print(question) # DEbugging
-        # # print(vals)
+        
         if len(criteria) == 4:
             choice = click.prompt(f"{question}?\nEnter to skip. q to end this session...", type=click.Choice(["yes", "no", 'q', ''], case_sensitive=False), show_choices=True, default='')
         else:
@@ -114,7 +113,7 @@ def better_session(bench_path) -> dict:
                 score=0,
                 )
                 bench_checklist[question] = yaml.dump(item)
-                print(bench_checklist[question])
+                click.echo(bench_checklist[question])
             case 'yes':
                 score = click.prompt(f"Please pick score level:\n0- {criteria[0]}\n5- {criteria[1]}\n10- {criteria[2]}\n15- {criteria[3]}\n", type=click.Choice([0, 5, 10, 15]), show_choices=True, default=5)
                 justification = click.prompt("Justification? ")
@@ -125,7 +124,7 @@ def better_session(bench_path) -> dict:
                 score=score,
                 )
                 bench_checklist[question] = yaml.dump(item)
-                print(bench_checklist[question])
+                click.echo(bench_checklist[question])
             case '':
                 continue
 
@@ -137,7 +136,7 @@ def better_session(bench_path) -> dict:
 
 
 
-    print(checklist_path) #debugging 
+    
     # Save current checklist into the benchmark repo
     if os.path.exists(checklist_path):
         with open(checklist_path, 'w') as f:
 
@@ -97,8 +97,9 @@ def init(benchmark_name, path, about, no_git):
 @benchtool.command()
 @click.argument('task-name',  required = True, type=str, ) 
-@click.option('-p','--benchmark-path', default='.', help="The path to the benchmark repository where the task will be added.", type=str)
-@click.option('-s','task-source', type=str,help="The relative path to  content that already exists`", required=True)
-@click.option('-t','--task-type', type=click.Choice(['folders', 'list']), help="The type of the task content being added. Options are csv or yml", required=True)
+# The explicit 'bench_path' name makes click pass the value under the keyword
+# that add_task() declares. Without it click derives `benchmark_path` from the
+# long flag and invoking the command fails with an unexpected-keyword TypeError.
+@click.option('-p','--benchmark-path', 'bench_path', default='.', help="The path to the benchmark repository where the task will be added.", type=str)
+@click.option('-s','--task-source', type=str,help="The relative path to  content that already exists`", required=True)
+@click.option('-t','--task-type', type=click.Choice(['folders', 'list']), 
+              help="The type of the task content being added. Options are folders or list", required=True)
 def add_task(task_name, bench_path, task_source,task_type):
     """
     Set up a new task.
@@ -135,25 +136,54 @@ def add_task(task_name, bench_path, task_source,task_type):
 @benchtool.command()
 @click.argument('benchmark-path', required = True, type=str)
 @click.argument('task_name', required = True)
-def run_task(benchmark_path: str, task_name):
+@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'aws']),
+              default="ollama", help="The engine that will run your LLM.")
+@click.option('-m', '--model', type=str, default="gemma3",
+              help="The LLM to be benchmarked.")
+@click.option('-a', '--api-url', type=str, default=None,
+              help="The api call required to access the runner engine.")
+@click.option('-l', '--log-path', type=str, default=None,
+              help="The path to a log directory.")
+def run_task(benchmark_path: str, task_name, runner_type, model, api_url, log_path):
     """
-    Running the tasks and generating logs
-
-    , help="The path to the benchmark repository where all the task reside."
-    , help="The name of the specific task you would like to run"
+    Run a single task of a benchmark and generate logs.
+
+    BENCHMARK_PATH is the path to the benchmark repository where all the tasks reside.
+    TASK_NAME is the name of the specific task you would like to run.
     """
 
-    benchmark = Bench.load(benchmark_path)
+    # Reject a path that is not a directory up front. Otherwise no loader runs,
+    # `benchmark` is never bound, and the command dies with UnboundLocalError.
+    if not os.path.isdir(benchmark_path):
+        raise click.BadParameter(f"'{benchmark_path}' is not a directory.",
+                                 param_hint='benchmark-path')
+
+    # Create BenchRunner object
+    runner = BenchRunner(runner_type, model, api_url)
+
+    # A tasks.yml file selects the YAML loader; otherwise load from the task folders
+    if 'tasks.yml' in os.listdir(benchmark_path):
+        benchmark = Bench.from_yaml(benchmark_path)
+    else:
+        benchmark = Bench.from_folders(benchmark_path)
+
     click.echo(f"Running {task_name} now")
-    benchmark.run([task_name])
+    benchmark.run_task(task_name, runner, log_path)
 
 @benchtool.command()
 @click.argument('benchmark-path', required = True, type=str)
-def run(benchmark_path: str):
+@click.option('-r', '--runner-type', type=click.Choice(['ollama', 'openai', 'aws']),
+               default="ollama", help="The engine that will run your LLM.")
+@click.option('-m', '--model', type=str, default="gemma3", 
+              help="The LLM to be benchmarked.")
+@click.option('-a', '--api-url', type=str, default=None, 
+              help="The api call required to access the runner engine.")
+@click.option('-l', '--log-path', type=str, default=None, 
+              help="The path to a log directory.")
+def run(benchmark_path: str, runner_type: str, model: str, api_url: str, log_path: str):
     """
     Running the benchmark and generating logs
     , help="The path to the benchmark repository where all the task reside."
     """
+    # Create BenchRunner object
+    runner = BenchRunner(runner_type, model, api_url)
+
     # check folder to see if folder or yaml type to load benchmark
     if os.path.isdir(benchmark_path):
         content = os.listdir(benchmark_path)
@@ -162,7 +192,7 @@ def run(benchmark_path: str):
         else:
             benchmark = Bench.from_folders(benchmark_path)
     click.echo(f"Running {benchmark.bench_name} now")
-    benchmark.run()
+    benchmark.run(runner, log_path)
 
 
 @click.group()