From 915fa76694c24e7df289a27d7c8f1487631a9b7f Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 12:32:55 -0500 Subject: [PATCH 01/19] add test for date question --- .../test_forecast_bots_live.py | 20 ++++++++++++++++++- .../data_models/data_organizer.py | 2 +- .../forecast_bots/forecast_bot.py | 7 ++++++- .../official_bots/fall_template_bot.py | 5 +++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/code_tests/integration_tests/test_forecast_bots_live.py b/code_tests/integration_tests/test_forecast_bots_live.py index 19f29c2..42ce000 100644 --- a/code_tests/integration_tests/test_forecast_bots_live.py +++ b/code_tests/integration_tests/test_forecast_bots_live.py @@ -7,7 +7,7 @@ import typeguard from code_tests.unit_tests.forecasting_test_manager import ForecastingTestManager -from forecasting_tools import MetaculusClient +from forecasting_tools import MetaculusClient, NumericDistribution from forecasting_tools.ai_models.general_llm import GeneralLlm from forecasting_tools.ai_models.resource_managers.monetary_cost_manager import ( MonetaryCostManager, @@ -113,6 +113,24 @@ async def test_taiwan_tournament_uniform_probability_bot() -> None: ), "Expected some conditional reports" +async def test_date_question() -> None: + bot = TemplateBot( + publish_reports_to_metaculus=True, + skip_previously_forecasted_questions=False, + llms={ + "default": GeneralLlm(model="openai/o4-mini", temperature=1), + "summarizer": GeneralLlm(model="openai/o4-mini", temperature=1), + "researcher": GeneralLlm(model="openai/o4-mini", temperature=1), + "parser": GeneralLlm(model="openai/o4-mini", temperature=1), + }, + ) + url = "https://www.metaculus.com/questions/7104/birthdate-of-the-first-human-to-live-to-1000/" + question = MetaculusClient().get_question_by_url(url) + assert isinstance(question, DateQuestion) + report = await bot.forecast_question(question) + assert isinstance(report.prediction, NumericDistribution) + + async def 
test_conditional_forecasts() -> None: bot = TemplateBot( publish_reports_to_metaculus=True, diff --git a/forecasting_tools/data_models/data_organizer.py b/forecasting_tools/data_models/data_organizer.py index ff4fb7f..2193520 100644 --- a/forecasting_tools/data_models/data_organizer.py +++ b/forecasting_tools/data_models/data_organizer.py @@ -64,7 +64,7 @@ class DataOrganizer: TypeMapping( question_type=DateQuestion, test_post_id=4110, # https://www.metaculus.com/questions/4110/birthdate-of-oldest-living-human-in-2200/ - report_type=None, # Not Implemented Yet + report_type=NumericReport, ), TypeMapping( question_type=MultipleChoiceQuestion, diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index ee26d64..0224704 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -523,7 +523,7 @@ async def _make_prediction( elif isinstance(question, ConditionalQuestion): forecast_function = lambda q, r: self._run_forecast_on_conditional(q, r) elif isinstance(question, DateQuestion): - raise NotImplementedError("Date questions not supported yet") + forecast_function = lambda q, r: self._run_forecast_on_date(q, r) else: raise ValueError(f"Unknown question type: {type(question)}") @@ -542,6 +542,11 @@ async def _run_forecast_on_multiple_choice( ) -> ReasonedPrediction[PredictedOptionList]: raise NotImplementedError("Subclass must implement this method") + async def _run_forecast_on_date( + self, question: NumericQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + raise NotImplementedError("Subclass must implement this method") + async def _run_forecast_on_conditional( self, question: ConditionalQuestion, research: str ) -> ReasonedPrediction[ConditionalPrediction]: diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index fd34960..5abbf1d 
100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -304,6 +304,11 @@ async def _run_forecast_on_binary( return await self._binary_prompt_to_forecast(question, prompt) + async def _run_forecast_on_date( + self, question: NumericQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + raise NotImplementedError("Implement numeric forecast") + async def _binary_prompt_to_forecast( self, question: BinaryQuestion, From fc6bc51d6db5d6c81342be34f0de41902d64b8de Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 13:38:20 -0500 Subject: [PATCH 02/19] add date question forecasting --- .../data_models/numeric_report.py | 3 +- forecasting_tools/data_models/questions.py | 1 + .../forecast_bots/forecast_bot.py | 2 +- .../official_bots/fall_template_bot.py | 139 ++++++++++++++++-- .../helpers/prediction_extractor.py | 6 +- 5 files changed, 134 insertions(+), 17 deletions(-) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index c9b895f..74926a0 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: from forecasting_tools.data_models.questions import ( + DateQuestion, DiscreteQuestion, NumericQuestion, ) @@ -231,7 +232,7 @@ def _check_distribution_too_tall(self, cdf: list[Percentile]) -> None: def from_question( cls, percentiles: list[Percentile], - question: NumericQuestion, + question: NumericQuestion | DateQuestion, standardize_cdf: bool | None = None, ) -> NumericDistribution: if standardize_cdf is None: diff --git a/forecasting_tools/data_models/questions.py b/forecasting_tools/data_models/questions.py index 7b562d0..e33aeb4 100644 --- a/forecasting_tools/data_models/questions.py +++ b/forecasting_tools/data_models/questions.py @@ -445,6 +445,7 @@ class DateQuestion(MetaculusQuestion, 
BoundedQuestionMixin): open_upper_bound: bool open_lower_bound: bool zero_point: float | None = None + cdf_size: int = 201 @model_validator(mode="before") @classmethod diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index 0224704..defcd6f 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -543,7 +543,7 @@ async def _run_forecast_on_multiple_choice( raise NotImplementedError("Subclass must implement this method") async def _run_forecast_on_date( - self, question: NumericQuestion, research: str + self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: raise NotImplementedError("Subclass must implement this method") diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index 5abbf1d..c358aed 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -18,6 +18,7 @@ from forecasting_tools.data_models.questions import ( BinaryQuestion, ConditionalQuestion, + DateQuestion, MetaculusQuestion, MultipleChoiceQuestion, NumericQuestion, @@ -305,9 +306,113 @@ async def _run_forecast_on_binary( return await self._binary_prompt_to_forecast(question, prompt) async def _run_forecast_on_date( - self, question: NumericQuestion, research: str + self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: - raise NotImplementedError("Implement numeric forecast") + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. 
+ + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - This is a date question, and as such, the answer must be expressed in terms of dates. + - The dates must be written in the format of YYYY-MM-DD. No other format is acceptable. + - You will not add time information to the dates. + - Always start with a lower date chronologically and then increase from there. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: YYYY-MM-DD + Percentile 20: YYYY-MM-DD + Percentile 40: YYYY-MM-DD + Percentile 60: YYYY-MM-DD + Percentile 80: YYYY-MM-DD + Percentile 90: YYYY-MM-DD + " + """ + ) + return await self._date_prompt_to_forecast(question, prompt) + + async def _date_prompt_to_forecast( + self, + question: DateQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a date question. 
+ - This text is trying to answer the numeric question: "{question.question_text}". + - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from and answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - The output is given as dates in the format of YYYY-MM-DD + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. + """ + ) + percentile_list: list[Percentile] = await structure_output( + reasoning, + list[Percentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + + if double_check_extraction: + redundant_extraction = PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( + reasoning, question + ) + for redundant_percentile in redundant_extraction.declared_percentiles: + matching_original_percentile = next( + ( + percentile + for percentile in percentile_list + if abs(percentile.percentile - redundant_percentile.percentile) + < 0.001 + ), + None, + ) + assert ( + matching_original_percentile is not None + ), f"Matching original percentile not found for {redundant_percentile.percentile}" + assert ( + abs(redundant_percentile.value - matching_original_percentile.value) + < 0.001 + ), f"Redundant extraction {redundant_percentile.value} does not match original percentile {matching_original_percentile.value} for percentile {redundant_percentile.percentile}" + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." 
+ ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) async def _binary_prompt_to_forecast( self, @@ -552,26 +657,34 @@ async def _numeric_prompt_to_forecast( return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) def _create_upper_and_lower_bound_messages( - self, question: NumericQuestion + self, question: NumericQuestion | DateQuestion ) -> tuple[str, str]: - if question.nominal_upper_bound is not None: - upper_bound_number = question.nominal_upper_bound - else: + if isinstance(question, NumericQuestion): + if question.nominal_upper_bound is not None: + upper_bound_number = question.nominal_upper_bound + else: + upper_bound_number = question.upper_bound + if question.nominal_lower_bound is not None: + lower_bound_number = question.nominal_lower_bound + else: + lower_bound_number = question.lower_bound + unit_of_measure = question.unit_of_measure + elif isinstance(question, DateQuestion): upper_bound_number = question.upper_bound - if question.nominal_lower_bound is not None: - lower_bound_number = question.nominal_lower_bound - else: lower_bound_number = question.lower_bound + unit_of_measure = None + else: + raise ValueError() if question.open_upper_bound: - upper_bound_message = f"The question creator thinks the number is likely not higher than {upper_bound_number} {question.unit_of_measure}." + upper_bound_message = f"The question creator thinks the number is likely not higher than {upper_bound_number} {unit_of_measure}." else: - upper_bound_message = f"The outcome can not be higher than {upper_bound_number} {question.unit_of_measure}." + upper_bound_message = f"The outcome can not be higher than {upper_bound_number} {unit_of_measure}." if question.open_lower_bound: - lower_bound_message = f"The question creator thinks the number is likely not lower than {lower_bound_number} {question.unit_of_measure}." 
+ lower_bound_message = f"The question creator thinks the number is likely not lower than {lower_bound_number} {unit_of_measure}." else: - lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {question.unit_of_measure}." + lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." return upper_bound_message, lower_bound_message diff --git a/forecasting_tools/helpers/prediction_extractor.py b/forecasting_tools/helpers/prediction_extractor.py index 1b36854..61ecf27 100644 --- a/forecasting_tools/helpers/prediction_extractor.py +++ b/forecasting_tools/helpers/prediction_extractor.py @@ -10,7 +10,7 @@ PredictedOptionList, ) from forecasting_tools.data_models.numeric_report import NumericDistribution, Percentile -from forecasting_tools.data_models.questions import NumericQuestion +from forecasting_tools.data_models.questions import DateQuestion, NumericQuestion logger = logging.getLogger(__name__) @@ -227,7 +227,9 @@ def _normalize_option_probabilities( @staticmethod def extract_numeric_distribution_from_list_of_percentile_number_and_probability( - text: str, question: NumericQuestion, standardize_cdf: bool | None = None + text: str, + question: NumericQuestion | DateQuestion, + standardize_cdf: bool | None = None, ) -> NumericDistribution: if not text or text.strip() == "": raise ValueError( From 40f5ca289cf8253407c8960937a40d024a2f1c5d Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:07:38 -0500 Subject: [PATCH 03/19] add DateStringPercentile --- .../data_models/numeric_report.py | 39 ++++++++++++++++++- .../official_bots/fall_template_bot.py | 20 ++++++++-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 74926a0..1ca3c13 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -1,6 +1,8 @@ from __future__ 
import annotations +import datetime import logging +import re from collections import Counter from typing import TYPE_CHECKING @@ -62,6 +64,37 @@ def validate_percentile(self: Percentile) -> Percentile: return self +class DateStringPercentile(BaseModel): + percentile: float = Field( + description="A number between 0 and 1 (e.g. '90% of people are age 60 or younger' translates to '0.9')", + ) + value: str = Field( + description="The number matching the percentile (e.g. '90% of people are age 60 or younger' translates to '60')", + ) + + def is_valid_date(self, date_string: str): + pattern = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$" + if re.match(pattern, date_string): + try: + datetime.datetime.strptime(date_string, "%Y-%m-%d") + return True + except ValueError: + return False + return False + + @model_validator(mode="after") + def validate_percentile(self: Percentile) -> Percentile: + if self.percentile < 0 or self.percentile > 1: + raise ValueError( + f"Percentile must be between 0 and 1, but was {self.percentile}" + ) + if np.isnan(self.percentile): + raise ValueError(f"Percentile must be a number, but was {self.percentile}") + if not self.is_valid_date(self.value): + raise ValueError(f"Date must be in YYYY-MM-DD format, but was {self.value}") + return self + + class NumericDistribution(BaseModel): declared_percentiles: list[Percentile] open_upper_bound: bool @@ -558,12 +591,14 @@ def _cdf_location_to_nominal_location(self, cdf_location: float) -> float: class NumericReport(ForecastReport): - question: NumericQuestion + question: NumericQuestion | DateQuestion prediction: NumericDistribution @classmethod async def aggregate_predictions( - cls, predictions: list[NumericDistribution], question: NumericQuestion + cls, + predictions: list[NumericDistribution], + question: NumericQuestion | DateQuestion, ) -> NumericDistribution: assert predictions, "No predictions to aggregate" cdfs = [prediction.get_cdf() for prediction in predictions] diff --git 
a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index c358aed..1a9f17f 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -14,7 +14,11 @@ from forecasting_tools.data_models.data_organizer import PredictionTypes from forecasting_tools.data_models.forecast_report import ReasonedPrediction from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList -from forecasting_tools.data_models.numeric_report import NumericDistribution, Percentile +from forecasting_tools.data_models.numeric_report import ( + DateStringPercentile, + NumericDistribution, + Percentile, +) from forecasting_tools.data_models.questions import ( BinaryQuestion, ConditionalQuestion, @@ -379,14 +383,24 @@ async def _date_prompt_to_forecast( - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. 
""" ) - percentile_list: list[Percentile] = await structure_output( + date_percentile_list: list[DateStringPercentile] = await structure_output( reasoning, - list[Percentile], + list[DateStringPercentile], model=self.get_llm("parser", "llm"), additional_instructions=parsing_instructions, num_validation_samples=self._structure_output_validation_samples, ) + percentile_list = [ + Percentile( + percentile=percentile.percentile, + value=datetime.strptime( + f"{percentile.value} UTC", "%Y-%m-%d" + ).timestamp(), + ) + for percentile in date_percentile_list + ] + if double_check_extraction: redundant_extraction = PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( reasoning, question From 853322db1956f6326a766014cf1e2ee7dcc5c4a9 Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:08:34 -0500 Subject: [PATCH 04/19] fix --- forecasting_tools/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index c1f2adb..0a024b0 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -108,6 +108,7 @@ ) from forecasting_tools.data_models.numeric_report import NumericReport as NumericReport from forecasting_tools.data_models.questions import BinaryQuestion as BinaryQuestion +from forecasting_tools.data_models.questions import DateQuestion as DateQuestion from forecasting_tools.data_models.questions import DiscreteQuestion as DiscreteQuestion from forecasting_tools.data_models.questions import ( MetaculusQuestion as MetaculusQuestion, From 01e73a602d1d0cb285cc1aeb341058a2d2da87de Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:10:19 -0500 Subject: [PATCH 05/19] fix formatting --- .../forecast_bots/official_bots/fall_template_bot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py 
b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index 1a9f17f..49606d9 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -684,8 +684,8 @@ def _create_upper_and_lower_bound_messages( lower_bound_number = question.lower_bound unit_of_measure = question.unit_of_measure elif isinstance(question, DateQuestion): - upper_bound_number = question.upper_bound - lower_bound_number = question.lower_bound + upper_bound_number = question.upper_bound.date().isoformat() + lower_bound_number = question.lower_bound.date().isoformat() unit_of_measure = None else: raise ValueError() From d0793c2d99543f340552c3c71ec7140906c1c5d5 Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:13:09 -0500 Subject: [PATCH 06/19] fix for NumericDistribution.from_question --- forecasting_tools/data_models/numeric_report.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 1ca3c13..0c5e4a3 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -268,13 +268,21 @@ def from_question( question: NumericQuestion | DateQuestion, standardize_cdf: bool | None = None, ) -> NumericDistribution: + if isinstance(question, NumericQuestion): + upper_bound = question.upper_bound + lower_bound = question.lower_bound + elif isinstance(question, DateQuestion): + upper_bound = question.upper_bound.timestamp() + lower_bound = question.lower_bound.timestamp() + else: + raise ValueError() if standardize_cdf is None: return NumericDistribution( declared_percentiles=percentiles, open_upper_bound=question.open_upper_bound, open_lower_bound=question.open_lower_bound, - upper_bound=question.upper_bound, - lower_bound=question.lower_bound, + upper_bound=upper_bound, + lower_bound=lower_bound, 
zero_point=question.zero_point, cdf_size=question.cdf_size, ) @@ -283,8 +291,8 @@ def from_question( declared_percentiles=percentiles, open_upper_bound=question.open_upper_bound, open_lower_bound=question.open_lower_bound, - upper_bound=question.upper_bound, - lower_bound=question.lower_bound, + upper_bound=upper_bound, + lower_bound=lower_bound, zero_point=question.zero_point, cdf_size=question.cdf_size, standardize_cdf=standardize_cdf, From 73ddbc25f9c77befdd9b5b97d2c6f942c8030cd2 Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:19:52 -0500 Subject: [PATCH 07/19] timezone fix --- .../forecast_bots/official_bots/fall_template_bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index 49606d9..cb9935d 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -395,7 +395,7 @@ async def _date_prompt_to_forecast( Percentile( percentile=percentile.percentile, value=datetime.strptime( - f"{percentile.value} UTC", "%Y-%m-%d" + f"{percentile.value} UTC", "%Y-%m-%d %Z" ).timestamp(), ) for percentile in date_percentile_list From 5fbc0039fc38c52c21b7149c3a636ce3ab1152b9 Mon Sep 17 00:00:00 2001 From: alex costea Date: Wed, 10 Dec 2025 14:46:39 -0500 Subject: [PATCH 08/19] fix for from_question --- forecasting_tools/data_models/numeric_report.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 0c5e4a3..27a4f0a 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -268,14 +268,14 @@ def from_question( question: NumericQuestion | DateQuestion, standardize_cdf: bool | None = None, ) -> NumericDistribution: - if 
isinstance(question, NumericQuestion): - upper_bound = question.upper_bound - lower_bound = question.lower_bound - elif isinstance(question, DateQuestion): - upper_bound = question.upper_bound.timestamp() - lower_bound = question.lower_bound.timestamp() - else: - raise ValueError() + + upper_bound = question.upper_bound + if isinstance(upper_bound, datetime.datetime): + upper_bound = upper_bound.timestamp() + lower_bound = question.lower_bound + if isinstance(lower_bound, datetime.datetime): + lower_bound = lower_bound.timestamp() + if standardize_cdf is None: return NumericDistribution( declared_percentiles=percentiles, From f3b510028e3e0b464a44f707664085154e1c218a Mon Sep 17 00:00:00 2001 From: alex costea Date: Mon, 15 Dec 2025 14:13:29 -0500 Subject: [PATCH 09/19] nits --- forecasting_tools/__init__.py | 3 ++ .../data_models/numeric_report.py | 6 ++-- .../official_bots/fall_template_bot.py | 33 ++++--------------- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index 0a024b0..ee2aed8 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -100,6 +100,9 @@ from forecasting_tools.data_models.multiple_choice_report import ( PredictedOptionList as PredictedOptionList, ) +from forecasting_tools.data_models.numeric_report import ( + DateStringPercentile as DateStringPercentile, +) from forecasting_tools.data_models.numeric_report import ( DiscreteReport as DiscreteReport, ) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 27a4f0a..371595d 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -66,10 +66,10 @@ def validate_percentile(self: Percentile) -> Percentile: class DateStringPercentile(BaseModel): percentile: float = Field( - description="A number between 0 and 1 (e.g. 
'90% of people are age 60 or younger' translates to '0.9')", + description="A number between 0 and 1 (e.g. '90% likelihood of AGI by 2040-01-01' translates to '0.9')", ) value: str = Field( - description="The number matching the percentile (e.g. '90% of people are age 60 or younger' translates to '60')", + description="The date matching the percentile (e.g. '90% likelihood of AGI by 2040-01-01' translates to '2040-01-01')", ) def is_valid_date(self, date_string: str): @@ -83,7 +83,7 @@ def is_valid_date(self, date_string: str): return False @model_validator(mode="after") - def validate_percentile(self: Percentile) -> Percentile: + def validate_percentile(self: DateStringPercentile) -> DateStringPercentile: if self.percentile < 0 or self.percentile > 1: raise ValueError( f"Percentile must be between 0 and 1, but was {self.percentile}" diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index cb9935d..9b23924 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -4,6 +4,8 @@ from datetime import datetime, timezone from typing import Literal +import pendulum + from forecasting_tools.agents_and_tools.research.smart_searcher import SmartSearcher from forecasting_tools.ai_models.general_llm import GeneralLlm from forecasting_tools.data_models.binary_report import BinaryPrediction @@ -378,7 +380,7 @@ async def _date_prompt_to_forecast( f""" The text given to you is trying to give a forecast distribution for a date question. - This text is trying to answer the numeric question: "{question.question_text}". - - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from and answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". 
+ - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - The output is given as dates in the format of YYYY-MM-DD - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. """ @@ -394,34 +396,13 @@ async def _date_prompt_to_forecast( percentile_list = [ Percentile( percentile=percentile.percentile, - value=datetime.strptime( - f"{percentile.value} UTC", "%Y-%m-%d %Z" - ).timestamp(), + value=pendulum.parse(percentile.value).timestamp(), ) for percentile in date_percentile_list ] if double_check_extraction: - redundant_extraction = PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( - reasoning, question - ) - for redundant_percentile in redundant_extraction.declared_percentiles: - matching_original_percentile = next( - ( - percentile - for percentile in percentile_list - if abs(percentile.percentile - redundant_percentile.percentile) - < 0.001 - ), - None, - ) - assert ( - matching_original_percentile is not None - ), f"Matching original percentile not found for {redundant_percentile.percentile}" - assert ( - abs(redundant_percentile.value - matching_original_percentile.value) - < 0.001 - ), f"Redundant extraction {redundant_percentile.value} does not match original percentile {matching_original_percentile.value} for percentile {redundant_percentile.percentile}" + raise ValueError("Double check extraction not supported for date questions") prediction = NumericDistribution.from_question(percentile_list, question) logger.info( f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." 
@@ -629,7 +610,7 @@ async def _numeric_prompt_to_forecast( - When parsing the text, please make sure to give the values (the ones assigned to percentiles) in terms of the correct units. - The units for the forecast are: {question.unit_of_measure} - Your work will be shown publicly with these units stated verbatim after the numbers your parse. - - As an example, someone else guessed that the answer will be between {question.lower_bound} {question.unit_of_measure} and {question.upper_bound} {question.unit_of_measure}, so the numbers parsed from and answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - As an example, someone else guessed that the answer will be between {question.lower_bound} {question.unit_of_measure} and {question.upper_bound} {question.unit_of_measure}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - If the answer doesn't give the answer in the correct units, you should parse it in the right units. For instance if the answer gives numbers as $500,000,000 and units are "B $" then you should parse the answer as 0.5 (since $500,000,000 is $0.5 billion). - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. - Turn any values that are in scientific notation into regular numbers. 
@@ -686,7 +667,7 @@ def _create_upper_and_lower_bound_messages( elif isinstance(question, DateQuestion): upper_bound_number = question.upper_bound.date().isoformat() lower_bound_number = question.lower_bound.date().isoformat() - unit_of_measure = None + unit_of_measure = "" else: raise ValueError() From 3562a73e7cd8484a26e94cd98dc987268de66128 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Tue, 16 Dec 2025 22:32:50 +0000 Subject: [PATCH 10/19] Added hourly parsing and readable date distribution --- forecasting_tools/__init__.py | 2 +- .../data_models/numeric_report.py | 55 +++++++++---------- .../official_bots/fall_template_bot.py | 14 ++--- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index ee2aed8..07e5295 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -101,7 +101,7 @@ PredictedOptionList as PredictedOptionList, ) from forecasting_tools.data_models.numeric_report import ( - DateStringPercentile as DateStringPercentile, + DatePercentile as DatePercentile, ) from forecasting_tools.data_models.numeric_report import ( DiscreteReport as DiscreteReport, diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 371595d..4e51474 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -1,9 +1,8 @@ from __future__ import annotations -import datetime import logging -import re from collections import Counter +from datetime import datetime from typing import TYPE_CHECKING import numpy as np @@ -64,34 +63,22 @@ def validate_percentile(self: Percentile) -> Percentile: return self -class DateStringPercentile(BaseModel): +class DatePercentile(BaseModel): percentile: float = Field( description="A number between 0 and 1 (e.g. 
'90% likelihood of AGI by 2040-01-01' translates to '0.9')", ) - value: str = Field( + value: datetime = Field( description="The date matching the percentile (e.g. '90% likelihood of AGI by 2040-01-01' translates to '2040-01-01')", ) - def is_valid_date(self, date_string: str): - pattern = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$" - if re.match(pattern, date_string): - try: - datetime.datetime.strptime(date_string, "%Y-%m-%d") - return True - except ValueError: - return False - return False - @model_validator(mode="after") - def validate_percentile(self: DateStringPercentile) -> DateStringPercentile: + def validate_percentile(self: DatePercentile) -> DatePercentile: if self.percentile < 0 or self.percentile > 1: raise ValueError( f"Percentile must be between 0 and 1, but was {self.percentile}" ) if np.isnan(self.percentile): raise ValueError(f"Percentile must be a number, but was {self.percentile}") - if not self.is_valid_date(self.value): - raise ValueError(f"Date must be in YYYY-MM-DD format, but was {self.value}") return self @@ -107,6 +94,7 @@ class NumericDistribution(BaseModel): ) standardize_cdf: bool = True strict_validation: bool = True + is_date: bool = False @model_validator(mode="after") def validate_percentiles(self: NumericDistribution) -> NumericDistribution: @@ -268,34 +256,39 @@ def from_question( question: NumericQuestion | DateQuestion, standardize_cdf: bool | None = None, ) -> NumericDistribution: + from forecasting_tools.data_models.questions import DateQuestion + + is_date = isinstance(question, DateQuestion) - upper_bound = question.upper_bound - if isinstance(upper_bound, datetime.datetime): - upper_bound = upper_bound.timestamp() - lower_bound = question.lower_bound - if isinstance(lower_bound, datetime.datetime): - lower_bound = lower_bound.timestamp() + if is_date: + upper_bound_float: float = question.upper_bound.timestamp() + lower_bound_float: float = question.lower_bound.timestamp() + else: + upper_bound_float = 
question.upper_bound + lower_bound_float = question.lower_bound if standardize_cdf is None: return NumericDistribution( declared_percentiles=percentiles, open_upper_bound=question.open_upper_bound, open_lower_bound=question.open_lower_bound, - upper_bound=upper_bound, - lower_bound=lower_bound, + upper_bound=upper_bound_float, + lower_bound=lower_bound_float, zero_point=question.zero_point, cdf_size=question.cdf_size, + is_date=is_date, ) else: return NumericDistribution( declared_percentiles=percentiles, open_upper_bound=question.open_upper_bound, open_lower_bound=question.open_lower_bound, - upper_bound=upper_bound, - lower_bound=lower_bound, + upper_bound=upper_bound_float, + lower_bound=lower_bound_float, zero_point=question.zero_point, cdf_size=question.cdf_size, standardize_cdf=standardize_cdf, + is_date=is_date, ) def get_representative_percentiles( @@ -648,7 +641,13 @@ def make_readable_prediction(cls, prediction: NumericDistribution) -> str: ) readable = "Probability distribution:\n" for percentile in representative_percentiles: - readable += f"- {percentile.percentile:.2%} chance of value below {round(percentile.value,6)}\n" + if prediction.is_date: + formatted_value = datetime.fromtimestamp(percentile.value).strftime( + "%Y-%m-%d %H:%M:%S UTC" + ) + else: + formatted_value = str(round(percentile.value, 6)) + readable += f"- {percentile.percentile:.2%} chance of value below {formatted_value}\n" return readable async def publish_report_to_metaculus(self) -> None: diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index 9b23924..eb5bd65 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -4,8 +4,6 @@ from datetime import datetime, timezone from typing import Literal -import pendulum - from forecasting_tools.agents_and_tools.research.smart_searcher import 
SmartSearcher from forecasting_tools.ai_models.general_llm import GeneralLlm from forecasting_tools.data_models.binary_report import BinaryPrediction @@ -17,7 +15,7 @@ from forecasting_tools.data_models.forecast_report import ReasonedPrediction from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList from forecasting_tools.data_models.numeric_report import ( - DateStringPercentile, + DatePercentile, NumericDistribution, Percentile, ) @@ -364,6 +362,8 @@ async def _run_forecast_on_date( Percentile 80: YYYY-MM-DD Percentile 90: YYYY-MM-DD " + + If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ """ ) return await self._date_prompt_to_forecast(question, prompt) @@ -381,13 +381,13 @@ async def _date_prompt_to_forecast( The text given to you is trying to give a forecast distribution for a date question. - This text is trying to answer the numeric question: "{question.question_text}". - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - - The output is given as dates in the format of YYYY-MM-DD + - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. 
""" ) - date_percentile_list: list[DateStringPercentile] = await structure_output( + date_percentile_list: list[DatePercentile] = await structure_output( reasoning, - list[DateStringPercentile], + list[DatePercentile], model=self.get_llm("parser", "llm"), additional_instructions=parsing_instructions, num_validation_samples=self._structure_output_validation_samples, @@ -396,7 +396,7 @@ async def _date_prompt_to_forecast( percentile_list = [ Percentile( percentile=percentile.percentile, - value=pendulum.parse(percentile.value).timestamp(), + value=percentile.value.timestamp(), ) for percentile in date_percentile_list ] From 09f8303e3d0d44fd44dbec8d68ddb86c1ccbdc85 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Tue, 16 Dec 2025 23:04:46 +0000 Subject: [PATCH 11/19] Updated uniform probability bot and moved date forecast function --- .../forecast_bots/forecast_bot.py | 1 + .../official_bots/fall_template_bot.py | 201 +++++++++--------- .../official_bots/uniform_probability_bot.py | 41 ++++ 3 files changed, 143 insertions(+), 100 deletions(-) diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index defcd6f..9ea91d6 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -545,6 +545,7 @@ async def _run_forecast_on_multiple_choice( async def _run_forecast_on_date( self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: + # Return a numeric distribution of timestamps raise NotImplementedError("Subclass must implement this method") async def _run_forecast_on_conditional( diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index eb5bd65..e4b2879 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -309,106 +309,6 @@ async 
def _run_forecast_on_binary( return await self._binary_prompt_to_forecast(question, prompt) - async def _run_forecast_on_date( - self, question: DateQuestion, research: str - ) -> ReasonedPrediction[NumericDistribution]: - upper_bound_message, lower_bound_message = ( - self._create_upper_and_lower_bound_messages(question) - ) - prompt = clean_indents( - f""" - You are a professional forecaster interviewing for a job. - - Your interview question is: - {question.question_text} - - Background: - {question.background_info} - - {question.resolution_criteria} - - {question.fine_print} - - Your research assistant says: - {research} - - Today is {datetime.now().strftime("%Y-%m-%d")}. - - {lower_bound_message} - {upper_bound_message} - - Formatting Instructions: - - This is a date question, and as such, the answer must be expressed in terms of dates. - - The dates must be written in the format of YYYY-MM-DD. No other format is acceptable. - - You will not add time information to the dates. - - Always start with a lower date chronologically and then increase from there. - - Before answering you write: - (a) The time left until the outcome to the question is known. - (b) The outcome if nothing changed. - (c) The outcome if the current trend continued. - (d) The expectations of experts and markets. - (e) A brief description of an unexpected scenario that results in a low outcome. - (f) A brief description of an unexpected scenario that results in a high outcome. - {self._get_conditional_disclaimer_if_necessary(question)} - You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. 
- - The last thing you write is your final answer as: - " - Percentile 10: YYYY-MM-DD - Percentile 20: YYYY-MM-DD - Percentile 40: YYYY-MM-DD - Percentile 60: YYYY-MM-DD - Percentile 80: YYYY-MM-DD - Percentile 90: YYYY-MM-DD - " - - If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ - """ - ) - return await self._date_prompt_to_forecast(question, prompt) - - async def _date_prompt_to_forecast( - self, - question: DateQuestion, - prompt: str, - double_check_extraction: bool = False, - ) -> ReasonedPrediction[NumericDistribution]: - reasoning = await self.get_llm("default", "llm").invoke(prompt) - logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") - parsing_instructions = clean_indents( - f""" - The text given to you is trying to give a forecast distribution for a date question. - - This text is trying to answer the numeric question: "{question.question_text}". - - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. - - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. 
- """ - ) - date_percentile_list: list[DatePercentile] = await structure_output( - reasoning, - list[DatePercentile], - model=self.get_llm("parser", "llm"), - additional_instructions=parsing_instructions, - num_validation_samples=self._structure_output_validation_samples, - ) - - percentile_list = [ - Percentile( - percentile=percentile.percentile, - value=percentile.value.timestamp(), - ) - for percentile in date_percentile_list - ] - - if double_check_extraction: - raise ValueError("Double check extraction not supported for date questions") - prediction = NumericDistribution.from_question(percentile_list, question) - logger.info( - f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." - ) - return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) - async def _binary_prompt_to_forecast( self, question: BinaryQuestion, @@ -651,6 +551,107 @@ async def _numeric_prompt_to_forecast( ) return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - This is a date question, and as such, the answer must be expressed in terms of dates. + - The dates must be written in the format of YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - Always start with a lower date chronologically and then increase from there. 
+ - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: YYYY-MM-DD + Percentile 20: YYYY-MM-DD + Percentile 40: YYYY-MM-DD + Percentile 60: YYYY-MM-DD + Percentile 80: YYYY-MM-DD + Percentile 90: YYYY-MM-DD + " + + If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ + """ + ) + return await self._date_prompt_to_forecast(question, prompt) + + async def _date_prompt_to_forecast( + self, + question: DateQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a date question. + - This text is trying to answer the numeric question: "{question.question_text}". + - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". 
+ - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. + """ + ) + date_percentile_list: list[DatePercentile] = await structure_output( + reasoning, + list[DatePercentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + + percentile_list = [ + Percentile( + percentile=percentile.percentile, + value=percentile.value.timestamp(), + ) + for percentile in date_percentile_list + ] + + if double_check_extraction: + raise ValueError("Double check extraction not supported for date questions") + + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." 
+ ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + def _create_upper_and_lower_bound_messages( self, question: NumericQuestion | DateQuestion ) -> tuple[str, str]: diff --git a/forecasting_tools/forecast_bots/official_bots/uniform_probability_bot.py b/forecasting_tools/forecast_bots/official_bots/uniform_probability_bot.py index f727ac8..7622b1c 100644 --- a/forecasting_tools/forecast_bots/official_bots/uniform_probability_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/uniform_probability_bot.py @@ -8,6 +8,7 @@ from forecasting_tools.data_models.numeric_report import NumericDistribution, Percentile from forecasting_tools.data_models.questions import ( BinaryQuestion, + DateQuestion, MetaculusQuestion, MultipleChoiceQuestion, NumericQuestion, @@ -89,6 +90,46 @@ async def _run_forecast_on_numeric( ), ) + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + lower_bound_timestamp = question.lower_bound.timestamp() + upper_bound_timestamp = question.upper_bound.timestamp() + distribution_range = upper_bound_timestamp - lower_bound_timestamp + + percentiles = [ + Percentile( + value=lower_bound_timestamp + 0.1 * distribution_range, + percentile=0.1, + ), + Percentile( + value=lower_bound_timestamp + 0.3 * distribution_range, + percentile=0.3, + ), + Percentile( + value=lower_bound_timestamp + 0.5 * distribution_range, + percentile=0.5, + ), + Percentile( + value=lower_bound_timestamp + 0.7 * distribution_range, + percentile=0.7, + ), + Percentile( + value=lower_bound_timestamp + 0.9 * distribution_range, + percentile=0.9, + ), + ] + + distribution = NumericDistribution.from_question(percentiles, question) + + return ReasonedPrediction( + prediction_value=distribution, + reasoning=( + "Created a uniform distribution between the lower and upper date bounds. 
" + "NOTE: The cdf will have sloping probability at the edges if the bounds are open" + ), + ) + async def summarize_research( self, question: MetaculusQuestion, research: str ) -> str: From b6b17667720984e6169e33fea8f3912fee3a597a Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Tue, 16 Dec 2025 23:28:45 +0000 Subject: [PATCH 12/19] reorganized functions --- .../official_bots/fall_template_bot.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index e4b2879..ae0b5f6 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -118,18 +118,6 @@ class FallTemplateBot2025(ForecastBot): _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions) _structure_output_validation_samples = 2 - def _get_conditional_disclaimer_if_necessary( - self, question: MetaculusQuestion - ) -> str: - if question.conditional_type not in ["yes", "no"]: - return "" - return clean_indents( - """ - As you are given a conditional question with a parent and child, you are to only forecast the **CHILD** question, given the parent question's resolution. - You never re-forecast the parent question under any circumstances, but you use probabilistic reasoning, strongly considering the parent question's resolution, to forecast the child question. - """ - ) - async def run_research(self, question: MetaculusQuestion) -> str: async with self._concurrency_limiter: research = "" @@ -368,6 +356,7 @@ async def _run_forecast_on_multiple_choice( (a) The time left until the outcome to the question is known. (b) The status quo outcome if nothing changed. (c) A description of an scenario that results in an unexpected outcome. 
+ {self._get_conditional_disclaimer_if_necessary(question)} You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes. @@ -479,6 +468,7 @@ async def _run_forecast_on_numeric( (d) The expectations of experts and markets. (e) A brief description of an unexpected scenario that results in a low outcome. (f) A brief description of an unexpected scenario that results in a high outcome. + {self._get_conditional_disclaimer_if_necessary(question)} You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. @@ -592,6 +582,7 @@ async def _run_forecast_on_date( (d) The expectations of experts and markets. (e) A brief description of an unexpected scenario that results in a low outcome. (f) A brief description of an unexpected scenario that results in a high outcome. + {self._get_conditional_disclaimer_if_necessary(question)} You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. @@ -608,7 +599,8 @@ async def _run_forecast_on_date( If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ """ ) - return await self._date_prompt_to_forecast(question, prompt) + forecast = await self._date_prompt_to_forecast(question, prompt) + return forecast async def _date_prompt_to_forecast( self, @@ -683,6 +675,18 @@ def _create_upper_and_lower_bound_messages( lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." 
return upper_bound_message, lower_bound_message + def _get_conditional_disclaimer_if_necessary( + self, question: MetaculusQuestion + ) -> str: + if question.conditional_type not in ["yes", "no"]: + return "" + return clean_indents( + """ + As you are given a conditional question with a parent and child, you are to only forecast the **CHILD** question, given the parent question's resolution. + You never re-forecast the parent question under any circumstances, but you use probabilistic reasoning, strongly considering the parent question's resolution, to forecast the child question. + """ + ) + if __name__ == "__main__": logging.basicConfig( From d4b55475f4d05649b08b2ed768a2aaa23e8a09b3 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Tue, 16 Dec 2025 23:52:12 +0000 Subject: [PATCH 13/19] Date questions pulled in via run_on_tournament --- forecasting_tools/__init__.py | 2 ++ forecasting_tools/data_models/data_organizer.py | 7 +++++-- forecasting_tools/data_models/numeric_report.py | 7 ++++++- forecasting_tools/forecast_bots/forecast_bot.py | 16 ---------------- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index 07e5295..92b7393 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -103,6 +103,7 @@ from forecasting_tools.data_models.numeric_report import ( DatePercentile as DatePercentile, ) +from forecasting_tools.data_models.numeric_report import DateReport as DateReport from forecasting_tools.data_models.numeric_report import ( DiscreteReport as DiscreteReport, ) @@ -128,6 +129,7 @@ ForecastReport.model_rebuild() NumericReport.model_rebuild() DiscreteReport.model_rebuild() +DateReport.model_rebuild() from forecasting_tools.data_models.questions import QuestionState as QuestionState from forecasting_tools.forecast_bots.forecast_bot import ForecastBot as ForecastBot from forecasting_tools.forecast_bots.forecast_bot import Notepad as Notepad diff --git 
a/forecasting_tools/data_models/data_organizer.py b/forecasting_tools/data_models/data_organizer.py index 2193520..dd26091 100644 --- a/forecasting_tools/data_models/data_organizer.py +++ b/forecasting_tools/data_models/data_organizer.py @@ -13,6 +13,7 @@ PredictedOptionList, ) from forecasting_tools.data_models.numeric_report import ( + DateReport, DiscreteReport, NumericDistribution, NumericReport, @@ -46,7 +47,9 @@ class TypeMapping(BaseModel): | DiscreteQuestion | ConditionalQuestion ) -ReportTypes = NumericReport | MultipleChoiceReport | BinaryReport | DiscreteReport +ReportTypes = ( + NumericReport | MultipleChoiceReport | BinaryReport | DiscreteReport | DateReport +) class DataOrganizer: @@ -64,7 +67,7 @@ class DataOrganizer: TypeMapping( question_type=DateQuestion, test_post_id=4110, # https://www.metaculus.com/questions/4110/birthdate-of-oldest-living-human-in-2200/ - report_type=NumericReport, + report_type=DateReport, ), TypeMapping( question_type=MultipleChoiceQuestion, diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 4e51474..bec75ff 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -592,7 +592,7 @@ def _cdf_location_to_nominal_location(self, cdf_location: float) -> float: class NumericReport(ForecastReport): - question: NumericQuestion | DateQuestion + question: NumericQuestion prediction: NumericDistribution @classmethod @@ -682,3 +682,8 @@ async def publish_report_to_metaculus(self) -> None: class DiscreteReport(NumericReport): question: DiscreteQuestion prediction: NumericDistribution + + +class DateReport(NumericReport): + question: DateQuestion + prediction: NumericDistribution diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index 9ea91d6..c0cc42f 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ 
b/forecasting_tools/forecast_bots/forecast_bot.py @@ -163,22 +163,6 @@ async def forecast_on_tournament( return_exceptions: bool = False, ) -> list[ForecastReport] | list[ForecastReport | BaseException]: questions = MetaculusApi.get_all_open_questions_from_tournament(tournament_id) - supported_question_types = [ - NumericQuestion, - MultipleChoiceQuestion, - BinaryQuestion, - ConditionalQuestion, - ] - supported_questions = [ - question - for question in questions - if isinstance(question, tuple(supported_question_types)) - ] - if len(supported_questions) != len(questions): - logger.warning( - f"Skipping {len(questions) - len(supported_questions)} questions that are not supported (probably date questions)" - ) - questions = supported_questions return await self.forecast_questions(questions, return_exceptions) @overload From d3349c476dff3bcca56c9621a52098ca9c358874 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 02:37:31 +0000 Subject: [PATCH 14/19] Added date support for researchonly bot --- .../test_data_models/test_forecast_report.py | 2 + .../official_bots/fall_research_only_bot.py | 55 +++++++++++++++++++ forecasting_tools/helpers/asknews_searcher.py | 8 +-- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/code_tests/unit_tests/test_data_models/test_forecast_report.py b/code_tests/unit_tests/test_data_models/test_forecast_report.py index 4cc2a12..22f2b64 100644 --- a/code_tests/unit_tests/test_data_models/test_forecast_report.py +++ b/code_tests/unit_tests/test_data_models/test_forecast_report.py @@ -15,6 +15,8 @@ def test_metaculus_report_is_jsonable() -> None: temp_writing_path = "temp/temp_metaculus_report.json" read_report_path = "code_tests/unit_tests/test_data_models/forecasting_test_data/metaculus_forecast_report_examples.json" + # TODO: Add examples for conditional and date reports (and discrete reports?) 
+ reports = DataOrganizer.load_reports_from_file_path(read_report_path) assert any(isinstance(report, NumericReport) for report in reports) assert any(isinstance(report, BinaryReport) for report in reports) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py index 0b2bfeb..026ae5f 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py @@ -8,6 +8,7 @@ from forecasting_tools.data_models.numeric_report import NumericDistribution from forecasting_tools.data_models.questions import ( BinaryQuestion, + DateQuestion, MetaculusQuestion, MultipleChoiceQuestion, NumericQuestion, @@ -172,3 +173,57 @@ async def _run_forecast_on_numeric( ) async with self._concurrency_limiter: return await self._numeric_prompt_to_forecast(question, prompt) + + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - This is a date question, and as such, the answer must be expressed in terms of dates. + - The dates must be written in the format of YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - Always start with a lower date chronologically and then increase from there. + - Do NOT forget this. 
The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. + + {self._instructions} + + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: YYYY-MM-DD + Percentile 20: YYYY-MM-DD + Percentile 40: YYYY-MM-DD + Percentile 60: YYYY-MM-DD + Percentile 80: YYYY-MM-DD + Percentile 90: YYYY-MM-DD + " + + If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ + """ + ) + async with self._concurrency_limiter: + return await self._date_prompt_to_forecast(question, prompt) diff --git a/forecasting_tools/helpers/asknews_searcher.py b/forecasting_tools/helpers/asknews_searcher.py index bb8f8a5..8cccedc 100644 --- a/forecasting_tools/helpers/asknews_searcher.py +++ b/forecasting_tools/helpers/asknews_searcher.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import Literal, Optional +from typing import Literal from asknews_sdk import AsyncAskNewsSDK @@ -31,9 +31,9 @@ class AskNewsSearcher: def __init__( self, - client_id: Optional[str] = None, - client_secret: Optional[str] = None, - api_key: Optional[str] = None, + client_id: str | None = None, + client_secret: str | None = None, + api_key: str | None = None, ) -> None: self.client_id = client_id or os.getenv("ASKNEWS_CLIENT_ID") self.client_secret = client_secret or os.getenv("ASKNEWS_SECRET") From 6012ba91bce1422627883ace9f3b101da0b76487 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 02:45:45 +0000 Subject: [PATCH 15/19] Fixed tests --- .../integration_tests/test_forecast_bots_live.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/code_tests/integration_tests/test_forecast_bots_live.py b/code_tests/integration_tests/test_forecast_bots_live.py index 42ce000..dd90938 100644 --- 
a/code_tests/integration_tests/test_forecast_bots_live.py +++ b/code_tests/integration_tests/test_forecast_bots_live.py @@ -89,7 +89,7 @@ async def test_predicts_ai_2027_tournament(bot: ForecastBot) -> None: reports = await bot.forecast_on_tournament("ai-2027") bot.log_report_summary(reports) - assert len(reports) == 15, "Expected 19 reports" + assert len(reports) == 19, "Expected 19 reports" except Exception as e: pytest.fail(f"Forecasting on ai-2027 tournament failed: {e}") @@ -204,13 +204,8 @@ async def test_collects_reports_on_open_questions(mocker: Mock) -> None: questions_that_should_be_being_forecast_on = ( MetaculusApi.get_all_open_questions_from_tournament(tournament_id) ) - date_questions = [ - question - for question in questions_that_should_be_being_forecast_on - if isinstance(question, DateQuestion) - ] - assert len(reports) == len(questions_that_should_be_being_forecast_on) - len( - date_questions + assert len(reports) == len( + questions_that_should_be_being_forecast_on ), "Not all questions were forecasted on" From f4c05cc1ab2e97634608680bc88dbe5b6915e9e7 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 02:54:34 +0000 Subject: [PATCH 16/19] Removed a redundant date question test --- .../test_forecast_bots_live.py | 21 +------------------ .../official_bots/fall_template_bot.py | 4 ++-- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/code_tests/integration_tests/test_forecast_bots_live.py b/code_tests/integration_tests/test_forecast_bots_live.py index dd90938..a4a714a 100644 --- a/code_tests/integration_tests/test_forecast_bots_live.py +++ b/code_tests/integration_tests/test_forecast_bots_live.py @@ -7,7 +7,7 @@ import typeguard from code_tests.unit_tests.forecasting_test_manager import ForecastingTestManager -from forecasting_tools import MetaculusClient, NumericDistribution +from forecasting_tools import MetaculusClient from forecasting_tools.ai_models.general_llm import GeneralLlm from
forecasting_tools.ai_models.resource_managers.monetary_cost_manager import ( MonetaryCostManager, @@ -17,7 +17,6 @@ from forecasting_tools.data_models.data_organizer import DataOrganizer from forecasting_tools.data_models.questions import ( ConditionalQuestion, - DateQuestion, MetaculusQuestion, ) from forecasting_tools.data_models.timestamped_predictions import ( @@ -113,24 +112,6 @@ async def test_taiwan_tournament_uniform_probability_bot() -> None: ), "Expected some conditional reports" -async def test_date_question() -> None: - bot = TemplateBot( - publish_reports_to_metaculus=True, - skip_previously_forecasted_questions=False, - llms={ - "default": GeneralLlm(model="openai/o4-mini", temperature=1), - "summarizer": GeneralLlm(model="openai/o4-mini", temperature=1), - "researcher": GeneralLlm(model="openai/o4-mini", temperature=1), - "parser": GeneralLlm(model="openai/o4-mini", temperature=1), - }, - ) - url = "https://www.metaculus.com/questions/7104/birthdate-of-the-first-human-to-live-to-1000/" - question = MetaculusClient().get_question_by_url(url) - assert isinstance(question, DateQuestion) - report = await bot.forecast_question(question) - assert isinstance(report.prediction, NumericDistribution) - - async def test_conditional_forecasts() -> None: bot = TemplateBot( publish_reports_to_metaculus=True, diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index ae0b5f6..2e5cb1b 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -596,7 +596,7 @@ async def _run_forecast_on_date( Percentile 90: YYYY-MM-DD " - If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ + If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ """ ) forecast = await 
self._date_prompt_to_forecast(question, prompt) @@ -613,7 +613,7 @@ async def _date_prompt_to_forecast( parsing_instructions = clean_indents( f""" The text given to you is trying to give a forecast distribution for a date question. - - This text is trying to answer the numeric question: "{question.question_text}". + - This text is trying to answer the question: "{question.question_text}". - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. From 425862aaf98608c01f7f223af97811ea670eeb64 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 03:08:15 +0000 Subject: [PATCH 17/19] Incorporated final AI code review --- forecasting_tools/data_models/numeric_report.py | 8 ++++---- forecasting_tools/data_models/questions.py | 1 + forecasting_tools/forecast_bots/forecast_bot.py | 1 + .../forecast_bots/official_bots/fall_research_only_bot.py | 4 +--- .../forecast_bots/official_bots/fall_template_bot.py | 4 +--- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index bec75ff..1ef7d87 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -2,7 +2,7 @@ import logging from collections import Counter -from datetime import datetime +from datetime import datetime, timezone from typing import TYPE_CHECKING import numpy as np @@ -642,9 +642,9 @@ def make_readable_prediction(cls, prediction: 
NumericDistribution) -> str: readable = "Probability distribution:\n" for percentile in representative_percentiles: if prediction.is_date: - formatted_value = datetime.fromtimestamp(percentile.value).strftime( - "%Y-%m-%d %H:%M:%S UTC" - ) + formatted_value = datetime.fromtimestamp( + percentile.value, tz=timezone.utc + ).strftime("%Y-%m-%d %H:%M:%S UTC") else: formatted_value = str(round(percentile.value, 6)) readable += f"- {percentile.percentile:.2%} chance of value below {formatted_value}\n" diff --git a/forecasting_tools/data_models/questions.py b/forecasting_tools/data_models/questions.py index e33aeb4..757e5d2 100644 --- a/forecasting_tools/data_models/questions.py +++ b/forecasting_tools/data_models/questions.py @@ -485,6 +485,7 @@ def from_metaculus_api_json(cls, api_json: dict) -> DateQuestion: open_upper_bound=open_upper_bound, open_lower_bound=open_lower_bound, zero_point=zero_point, + cdf_size=cls._get_cdf_size_from_json(api_json), **normal_metaculus_question.model_dump(), ) diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index c0cc42f..6f3d816 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -526,6 +526,7 @@ async def _run_forecast_on_multiple_choice( ) -> ReasonedPrediction[PredictedOptionList]: raise NotImplementedError("Subclass must implement this method") + @abstractmethod async def _run_forecast_on_date( self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: diff --git a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py index 026ae5f..152075f 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py @@ -204,7 +204,7 @@ async def _run_forecast_on_date( Formatting Instructions: - This 
is a date question, and as such, the answer must be expressed in terms of dates. - - The dates must be written in the format of YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. - Always start with a lower date chronologically and then increase from there. - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. @@ -221,8 +221,6 @@ async def _run_forecast_on_date( Percentile 80: YYYY-MM-DD Percentile 90: YYYY-MM-DD " - - If hours matter, please prepend the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ """ ) async with self._concurrency_limiter: diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index 2e5cb1b..d9bc9b5 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -571,7 +571,7 @@ async def _run_forecast_on_date( Formatting Instructions: - This is a date question, and as such, the answer must be expressed in terms of dates. - - The dates must be written in the format of YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ.No other formatting is allowed. - Always start with a lower date chronologically and then increase from there. - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. 
@@ -595,8 +595,6 @@ async def _run_forecast_on_date( Percentile 80: YYYY-MM-DD Percentile 90: YYYY-MM-DD " - - If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ """ ) forecast = await self._date_prompt_to_forecast(question, prompt) From 9383935d6dafbaf4717586e0f0bfbac24d24f27b Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 03:13:07 +0000 Subject: [PATCH 18/19] Removed abstractclass --- forecasting_tools/forecast_bots/forecast_bot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index 6f3d816..c0cc42f 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -526,7 +526,6 @@ async def _run_forecast_on_multiple_choice( ) -> ReasonedPrediction[PredictedOptionList]: raise NotImplementedError("Subclass must implement this method") - @abstractmethod async def _run_forecast_on_date( self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: From 52a210401758f85d006200431a4a37015bb22547 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 03:28:42 +0000 Subject: [PATCH 19/19] Slight update in prompting - handle non chronological --- .../forecast_bots/official_bots/fall_research_only_bot.py | 4 ++-- .../forecast_bots/official_bots/fall_template_bot.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py index 152075f..0ce2dad 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py @@ -214,12 +214,12 @@ async def _run_forecast_on_date( The last thing you write is your final answer as: " - Percentile 10: YYYY-MM-DD + Percentile 10: YYYY-MM-DD 
(oldest date) Percentile 20: YYYY-MM-DD Percentile 40: YYYY-MM-DD Percentile 60: YYYY-MM-DD Percentile 80: YYYY-MM-DD - Percentile 90: YYYY-MM-DD + Percentile 90: YYYY-MM-DD (newest date) " """ ) diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py index d9bc9b5..ff552eb 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py @@ -588,12 +588,12 @@ async def _run_forecast_on_date( The last thing you write is your final answer as: " - Percentile 10: YYYY-MM-DD + Percentile 10: YYYY-MM-DD (oldest date) Percentile 20: YYYY-MM-DD Percentile 40: YYYY-MM-DD Percentile 60: YYYY-MM-DD Percentile 80: YYYY-MM-DD - Percentile 90: YYYY-MM-DD + Percentile 90: YYYY-MM-DD (newest date) " """ )