Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/AIME24/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
examples = self.load_questions()
# Prepare instances for model
all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -98,10 +97,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for AIME24...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for AIME24...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/AIME25/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
examples = self.load_questions()
# Prepare instances for model
all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -97,10 +96,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for AIME25...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for AIME25...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
12 changes: 7 additions & 5 deletions eval/chat_benchmarks/AMC23/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
model_name = model.model_args["model"]

all_outputs = []
all_instances = []
for i in range(self.n_repeat):
seed = [s + i for s in self.seed]
all_instances = []
for idx, example in enumerate(examples):
messages = [
{"role": "user", "content": PROMPT.format(problem=example["question"])},
Expand Down Expand Up @@ -107,11 +107,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for AMC23...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for AMC23...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/CodeElo/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ def make_html_problem(problem):
return html_output

instruction = """You are a coding expert. Given a competition-level coding problem, you need to write a Python program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```. The code should take stdin as input and print the output."""

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand All @@ -158,10 +157,13 @@ def make_html_problem(problem):
instance.repeat_idx = i
all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for CodeElo...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for CodeElo...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]

# Return None early for non-primary ranks
if model.rank != 0:
Expand Down
13 changes: 8 additions & 5 deletions eval/chat_benchmarks/CodeForces/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ def make_html_problem(problem):

instruction = """You are a coding expert. Given a competition-level coding problem, you need to write a Python program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```. The code should take stdin as input and print the output. Your program should be a Python function generated from the given prompt. Simply call the function after the definition."""

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand All @@ -150,10 +150,13 @@ def make_html_problem(problem):
instance.repeat_idx = i
all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for CodeForces...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for CodeForces...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]

# Return None early for non-primary ranks
if model.rank != 0:
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/GPQADiamond/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
model_name = model.model_args["model"]

all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -111,10 +110,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
instance.repeat_idx = i
all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for GPQADiamond...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for GPQADiamond...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]

# Return None early for non-primary ranks
if model.rank != 0:
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/HLE/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -133,10 +132,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for HLE...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for HLE...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/HMMT/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
examples = self.load_questions()
# Prepare instances for model
all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -101,10 +100,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for HMMT...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for HMMT...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/JEEBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -154,10 +153,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for JEEBench...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for JEEBench...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down
14 changes: 8 additions & 6 deletions eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
examples = examples[:10]

all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -122,10 +121,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
instance.repeat_idx = i
all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for LiveCodeBench...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for LiveCodeBench...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]

# Return None early for non-primary ranks
if model.rank != 0:
Expand Down
13 changes: 7 additions & 6 deletions eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
examples = examples[:10]

all_outputs = []

all_instances = []
for i in range(self.n_repeat):
all_instances = []
seed = [s + i for s in self.seed]

for idx, example in enumerate(examples):
Expand Down Expand Up @@ -118,11 +117,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
instance.repeat_idx = i
all_instances.append(instance)

# Generate model responses
self.logger.info("Generating responses for LiveCodeBenchV5...")
outputs = self.compute(model, all_instances)
all_outputs.append(outputs)
# Generate model responses
self.logger.info("Generating responses for LiveCodeBenchV5...")
all_outputs = self.compute(model, all_instances)

all_outputs = [
all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
]
# Return None early for non-primary ranks
if model.rank != 0:
return None
Expand Down