diff --git a/environments/ether0/README.md b/environments/ether0/README.md new file mode 100644 index 000000000..1a7774866 --- /dev/null +++ b/environments/ether0/README.md @@ -0,0 +1,41 @@ +# ether0 benchmark environment + +[Benchmark](https://huggingface.co/datasets/futurehouse/ether0-benchmark) and [paper](https://arxiv.org/pdf/2506.17238). + +325 chemistry reasoning questions across 14 task types. All answers are a molecule. Around 25 questions per task, including: + +- Completing SMILES fragments +- Designing molecules adhering to molecular formula and functional group constraints +- Predicting reaction outcomes +- Proposing one-step synthesis pathways +- Editing the solubility of a molecule +- Converting IUPAC name to SMILES +- Answering multiple-choice questions about safety, ADME properties, BBB permeability, toxicity, scent, and pKa + +Note that retro-synthesis and oracle-solubility require an additional verifier server (see `ether0-serve` in the [ether0 repo](https://github.com/Future-House/ether0/)). + +## Quickstart + +Create `env.yaml`: +``` +policy_base_url: http://localhost:8000/v1 +policy_api_key: EMPTY +policy_model_name: futurehouse/ether0 +``` + +Start servers and collect rollouts +```bash +# start vllm and nemo gym servers +vllm serve futurehouse/ether0 & +ng_run "+config_paths=[environments/ether0/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" & + +# wait for above to be ready +ng_collect_rollouts \ + +agent_name=ether0_simple_agent \ + +input_jsonl_fpath=environments/ether0/data/example.jsonl \ + +output_jsonl_fpath=environments/ether0/data/ether0_rollouts.jsonl + +tail -n 1 environments/ether0/data/ether0_rollouts.jsonl | jq | less +``` + +See `prepare.py` to prepare the full dataset. 
\ No newline at end of file diff --git a/environments/ether0/__init__.py b/environments/ether0/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/environments/ether0/config.yaml b/environments/ether0/config.yaml new file mode 100644 index 000000000..55b274f98 --- /dev/null +++ b/environments/ether0/config.yaml @@ -0,0 +1,26 @@ +ether0: + resources_servers: + ether0: + entrypoint: app.py + domain: knowledge + verified: false + description: ether0 chemistry benchmark verifiers + value: Evaluate chemistry knowledge and reasoning with ether0 benchmark +ether0_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: ether0 + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: example + type: example + jsonl_fpath: environments/ether0/data/example.jsonl + - name: val + type: validation + jsonl_fpath: environments/ether0/data/val.jsonl + license: Creative Commons Attribution 4.0 International diff --git a/environments/ether0/data/.gitignore b/environments/ether0/data/.gitignore new file mode 100644 index 000000000..cf42462ab --- /dev/null +++ b/environments/ether0/data/.gitignore @@ -0,0 +1,6 @@ +*train.jsonl +*validation.jsonl +*val.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/environments/ether0/data/example.jsonl b/environments/ether0/data/example.jsonl new file mode 100644 index 000000000..ce73fc076 --- /dev/null +++ b/environments/ether0/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content": "You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO"}, {"role": "user", "content": "Generate a SMILES representation for a molecule containing groups: charged and nitro. 
It should also have formula C13H12N6O5."}]}, "verifier_metadata": {"solution": "functional_group_eval!:!('C13H12N6O5', ['charged', 'nitro'])!:!functional-group", "problem_type": "functional-group", "ideal": "Cc1ncc([N+](=O)[O-])n1CC(=O)N/N=C/c1ccc([N+](=O)[O-])cc1", "id": "00c8bc2d-0bb3-53c2-8bdf-cd19616d4536"}, "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO"}, {"role": "user", "content": "Among the following, which molecule is predicted to have a permeability in MDCK cells in MDR1-MDCK efflux ratio B-A/A-B close to 1.04?\nFC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F\nC(NC1=CC2=CN(N=C2C=C1C(C)(O)C)CCC(C)(O)C)(=O)C1N=C(C=CC=1)C(F)(F)F\nC12C=C(NC(=O)C3C=CN(C(F)F)N=3)C(OC)=CC1=NN(C=2)CCC(C)(C)O"}]}, "verifier_metadata": {"solution": "str_eval!:!FC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F!:!property-regression-adme/log_mdr1-mdck_er", "problem_type": "property-regression-adme/log_mdr1-mdck_er", "ideal": "CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1", "id": "066b28c7-c991-5095-8045-a5da176c150a"}, "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO"}, {"role": "user", "content": "A compound with formula C30H44O7 was isolated from Nerium oleander L.. 
What is a plausible SMILES for it given this organism?\nspecies: Nerium oleander L.\ntaxonomicGroup: Angiosperms\nhabitat: Temperate and subtropical areas, along river banks, stream beds in river valleys, roadsides, parks, coastal gardens\nlifestyle: Free-living\nmetabolicType: Photoautotrophic\ncellularOrganization: Multicellular\npresenceOfOrganelles: Mitochondria, chloroplasts\ncellWallComposition: Cellulose"}]}, "verifier_metadata": {"solution": "formula_eval!:!CO[C@@H]1C[C@H](O[C@H]2CC[C@@]3(C)[C@H](CC[C@]45CC[C@H](C6=CC(=O)OC6)[C@@](C)(CC[C@H]34)C5=O)C2)O[C@H](C)[C@@H]1O!:!molecule-formula", "problem_type": "molecule-formula", "ideal": "CO[C@@H]1C[C@H](O[C@H]2CC[C@@]3(C)[C@H](CC[C@]45CC[C@H](C6=CC(=O)OC6)[C@@](C)(CC[C@H]34)C5=O)C2)O[C@H](C)[C@@H]1O", "id": "a0e5657a-901a-5888-af3b-87c8c8471ea8"}, "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO"}, {"role": "user", "content": "Identify which of the following molecules will most likely have a rat LD50 oral in mg/kg of 6.09:\nC(CS)(=O)O.C(O)CN\nFCC(=O)O\nClCC(=O)O"}]}, "verifier_metadata": {"solution": "str_eval!:!FCC(=O)O!:!property-regression-ld50", "problem_type": "property-regression-ld50", "ideal": "FCC(=O)O", "id": "6af247d8-aaec-5047-8b57-af1b42f9d38a"}, "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. 
For example: CCO"}, {"role": "user", "content": "Given that molecule [N+](=O)([O-])C1=CC=CC2=CC=CC=C12 is toxic, select from below the molecule most expected to not have this characteristic:\n[N+](=O)([O-])C1=C(C)C(=CC=C1)[N+](=O)[O-]\n[N+](=O)([O-])C1=CC=CC2=N[Se]N=C21\nIC1=C(C=CC=C1[N+](=O)[O-])[N+](=O)[O-]\nBrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]"}]}, "verifier_metadata": {"solution": "str_eval!:!BrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]!:!property-cat-safety/delta-toxic", "problem_type": "property-cat-safety/delta-toxic", "ideal": "BrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]", "id": "ea5a9ab5-7207-5016-8465-4634f7db5437"}, "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}} diff --git a/environments/ether0/data/example_metrics.json b/environments/ether0/data/example_metrics.json new file mode 100644 index 000000000..396ca58a9 --- /dev/null +++ b/environments/ether0/data/example_metrics.json @@ -0,0 +1,38 @@ +{ + "name": "example", + "type": "example", + "jsonl_fpath": "resources_servers/ether0/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": null, + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 52.6, + "Min": 46.0, + "Max": 75.0, + "Standard deviation": 12.64 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + } +} \ No newline at end of file diff --git a/environments/ether0/data/example_rollouts.jsonl b/environments/ether0/data/example_rollouts.jsonl new file mode 100644 index 000000000..741ae8742 --- /dev/null +++ 
b/environments/ether0/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO","role":"system","type":"message"},{"content":"Given that molecule [N+](=O)([O-])C1=CC=CC2=CC=CC=C12 is toxic, select from below the molecule most expected to not have this characteristic:\n[N+](=O)([O-])C1=C(C)C(=CC=C1)[N+](=O)[O-]\n[N+](=O)([O-])C1=CC=CC2=N[Se]N=C21\nIC1=C(C=CC=C1[N+](=O)[O-])[N+](=O)[O-]\nBrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]","role":"user","type":"message"}],"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":null,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_c168879a0ea94ff7b90f73afa8441065","created_at":1772870840.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"futurehouse/ether0","object":"response","output":[{"id":"msg_47658202dda4470182133cd06ede0747","content":[{"annotations":[],"text":"<|think_start|>Okay, let's see. The question is about identifying which of the given molecules is least likely to have the toxic characteristic of the original molecule, which is [N+](=O)([O-])C1=CC=CC2=CC=CC=C12. The original molecule's SMILES notation is a nitro group ([N+](=O)[O-]) attached to a naphthalene ring system.\n\nThe options are:\n\n1. [N+](=O)([O-])C1=C(C)C(=CC=C1)[N+](=O)[O-]\n2. [N+](=O)([O-])C1=CC=CC2=N[Se]N=C21\n3. IC1=C(C=CC=C1[N+](=O)[O-])[N+](=O)[O-]\n4. BrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]\n\nThe fourth option is the least toxic. It is a brominated nitro-substituted naphthalene, which is less toxic compared to the original molecule. 
Bromination does not significantly alter toxicity, and the nitro group is a common functional group in such compounds. Therefore, the answer is the fourth option.<|think_end|><|answer_start|>BrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]<|answer_end|>","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":null,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":null,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":390,"input_tokens_details":{"cached_tokens":0},"output_tokens":604,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":994},"user":null},"reward":1.0,"extracted_answer":"BrC=1C=C(C2=CC=CC=C2C1)[N+](=O)[O-]","eval_function":"str_eval","problem_type":"property-cat-safety/delta-toxic","_ng_task_index":4,"_ng_rollout_index":0} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. 
For example: CCO","role":"system","type":"message"},{"content":"Identify which of the following molecules will most likely have a rat LD50 oral in mg/kg of 6.09:\nC(CS)(=O)O.C(O)CN\nFCC(=O)O\nClCC(=O)O","role":"user","type":"message"}],"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":null,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_9a19a3aaf458425ea27fefd3dbe4b433","created_at":1772870840.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"futurehouse/ether0","object":"response","output":[{"id":"msg_5acde9eeee9d49418eb1b2f821105e5c","content":[{"annotations":[],"text":"<|think_start|>Okay, let's see. I need to figure out which of these three molecules has an oral LD50 in rats of 6.09 mg/kg. LD50 is the lethal dose that kills 50% of the test subjects, so a lower LD50 means the compound is more toxic. 6.09 mg/kg is a pretty low value, so the compound is quite toxic.\n\nFirst, let's look at the structures of the given molecules:\n\n1. **C(CS)(=O)O.C(O)CN**\n - This is a mixture of a thioether derivative and a carbonyl compound, which is unlikely to have an oral LD50 of 6.09 mg/kg.\n\n2. **FCC(=O)O**\n - This is fluoroacetic acid, which is highly toxic. Fluoroacetic acid inhibits the enzyme acetyl-CoA carboxylate, leading to a fatality. Its oral LD50 in rats is around 100–500 mg/kg, which is much lower than 6.09 mg/kg.\n\n3. **ClCC(=O)O**\n - This is chloroacetic acid, which is also highly toxic. 
Chloroacetic acid is a known toxic compound with an oral LD50 in rats around 76–100 mg/kg, which is much lower than 6.09 mg/kg.\n\nComparing the options, the second molecule, **FCC(=O)O (fluoroacetic acid)**, is the most likely to have an oral LD50 of 6.09 mg/kg.<|think_end|><|answer_start|>FCC(=O)O<|answer_end|>","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":null,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":null,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":192,"input_tokens_details":{"cached_tokens":0},"output_tokens":692,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":884},"user":null},"reward":1.0,"extracted_answer":"FCC(=O)O","eval_function":"str_eval","problem_type":"property-regression-ld50","_ng_task_index":3,"_ng_rollout_index":0} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. 
For example: CCO","role":"system","type":"message"},{"content":"Among the following, which molecule is predicted to have a permeability in MDCK cells in MDR1-MDCK efflux ratio B-A/A-B close to 1.04?\nFC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F\nC(NC1=CC2=CN(N=C2C=C1C(C)(O)C)CCC(C)(O)C)(=O)C1N=C(C=CC=1)C(F)(F)F\nC12C=C(NC(=O)C3C=CN(C(F)F)N=3)C(OC)=CC1=NN(C=2)CCC(C)(C)O","role":"user","type":"message"}],"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":null,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_f7739b7230a34edbb03027fec63180fa","created_at":1772870843.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"futurehouse/ether0","object":"response","output":[{"id":"msg_a1730cc889fb4378a6ea4db597c2764d","content":[{"annotations":[],"text":"<|think_start|>Okay, let's see. I need to figure out which of these three molecules has an MDCK permeability efflux ratio around 1.04. MDCK cells, especially MDR1-MDCK, are used to assess a compound's susceptibility to efflux pumps, typically involving P-glycoprotein (P-gp). A lower efflux ratio means the compound is less of a substrate for efflux pumps, so it's less easily transported. A ratio around 1.04 is relatively low, indicating the compound is not significantly effluxed.\n\nFirst, let's look at the structures of the given molecules:\n\n1. **First Molecule:** FC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F\n - Contains a fluorinated pyridine ring, an ester group, and a triazole ring.\n - Key properties: Fluorine atoms increase lipophilicity, and the triazole ring might be stable. The ester group could be susceptible to hydrolysis, but the overall structure might reduce efflux.\n\n2. 
**Second Molecule:** C(NC1=CC2=CN(N=C2C=C1C(C)(O)C)CCC(C)(O)C)(=O)C1N=C(C=CC=1)C(F)(F)F\n - Contains a trifluoromethyl group, a morpholine-like ring, and hydroxyl groups.\n - Key properties: High molecular weight and multiple hydroxyl groups increase polarity, reducing efflux.\n\n3. **Third Molecule:** C12C=C(NC(=O)C3C=CN(C(F)F)N=3)C(OC)=CC1=NN(C=2)CCC(C)(C)O\n - Contains a bicyclic structure, an ester group, and a triazole ring.\n - Key properties: Rigid structure, fluorinated groups, and a triazole ring might reduce efflux.\n\nComparing the structures, the first molecule is most likely to have an efflux ratio around 1.04.<|think_end|><|answer_start|>FC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F<|answer_end|>","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":null,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":null,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":424,"input_tokens_details":{"cached_tokens":0},"output_tokens":966,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":1390},"user":null},"reward":1.0,"extracted_answer":"FC(C1=NC(C(NC2C(OCC)=CC3=NN(CCC(O)(C)C)C=C3C=2)=O)=CC=C1)F","eval_function":"str_eval","problem_type":"property-regression-adme/log_mdr1-mdck_er","_ng_task_index":1,"_ng_rollout_index":0} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO","role":"system","type":"message"},{"content":"A compound with formula C30H44O7 was isolated from Nerium oleander L.. 
What is a plausible SMILES for it given this organism?\nspecies: Nerium oleander L.\ntaxonomicGroup: Angiosperms\nhabitat: Temperate and subtropical areas, along river banks, stream beds in river valleys, roadsides, parks, coastal gardens\nlifestyle: Free-living\nmetabolicType: Photoautotrophic\ncellularOrganization: Multicellular\npresenceOfOrganelles: Mitochondria, chloroplasts\ncellWallComposition: Cellulose","role":"user","type":"message"}],"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":null,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_7ee539ba3e8c449c8585dfd4176b957c","created_at":1772870843.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"futurehouse/ether0","object":"response","output":[{"id":"msg_6ca41cded5ae468d827d4deb822bb46f","content":[{"annotations":[],"text":"<|think_start|>Okay, let's see... The user is asking about a compound with the formula C30H44O7 isolated from Nerium oleander L., which is an angiosperm. They want a plausible SMILES structure for it. Nerium oleander is known to produce cardiac glycosides, which typically consist of a steroid nucleus (aglycone) attached to a sugar moiety.\n\nThe molecular formula C30H44O7 suggests a cardiac glycoside structure. Cardiac glycosides typically have a steroid nucleus (around 23-24 carbons) and a sugar part. 
The steroid part could be a cardiac aglycone like digitoxigenin (C23H38O14), and the sugar part could be a deoxy sugar like digitoxose (C6H10O4).\n\nLet's calculate the degrees of unsaturation:\nDegrees of unsaturation = (2C + 2 - H - X + N)/2 = (2*30 + 2 - 44)/2 = 4.\nThis suggests 4 degrees of unsaturation, which could be rings or double bonds.\n\nGiven the formula C30H44O7, let's consider a cardiac glycoside with a steroid aglycone and a deoxy sugar. The steroid part could be a cardiac aglycone like digitoxigenin (C23H38O14), and the sugar part could be a deoxy sugar like digitoxose (C6H10O4).\n\nA plausible SMILES structure for a cardiac glycoside from Nerium oleander L. with the formula C30H44O7 could be:\n\n```\nCC(C)C1CCC2C1C3CCC4C5C(CCC(C5CCC4(C3CC2O)C)OC6C(C(C(C(O6)CO)O)O)O)OC7C(C(C(C(O7)CO)O)O)O\n```\n\nThis structure includes a steroid nucleus with a lactone ring and a deoxy sugar moiety, fitting the given formula and the known chemistry of Nerium oleander.<|think_end|><|answer_start|>CC(C)C1CCC2C1C3CCC4C5C(CCC(C5CCC4(C3CC2O)C)OC6C(C(C(C(O6)CO)O)O)O)OC7C(C(C(C(O7)CO)O)O)O<|answer_end|>","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":null,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":null,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":322,"input_tokens_details":{"cached_tokens":0},"output_tokens":1044,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":1366},"user":null},"reward":0.0,"extracted_answer":"CC(C)C1CCC2C1C3CCC4C5C(CCC(C5CCC4(C3CC2O)C)OC6C(C(C(C(O6)CO)O)O)O)OC7C(C(C(C(O7)CO)O)O)O","eval_function":"formula_eval","problem_type":"molecule-formula","_ng_task_index":2,"_ng_rollout_index":0} 
+{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are a scientific reasoning agent. Think step by step, then place your final answer inside tags. For example: CCO","role":"system","type":"message"},{"content":"Generate a SMILES representation for a molecule containing groups: charged and nitro. It should also have formula C13H12N6O5.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":null,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_3f0f2705a99d4a089894302ade08a8a0","created_at":1772870856.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"futurehouse/ether0","object":"response","output":[{"id":"msg_364a6d3f16b54a00bdf23ce2f50f5d8f","content":[{"annotations":[],"text":"<|think_start|>Okay, let's see... The user is asking about a chemical molecule that includes a charged group and a nitro group, and the formula is C13H12N6O5. I need to figure out a valid structure that fits this formula.\n\nFirst, consider the nitro group (NO2), which uses one nitrogen and two oxygens. The formula is C13H12N6O5. The nitro group leaves C13H11N5O3. To fit the formula, consider benzimidazole (C7H6N2) with a nitro group (NO2) → C7H5N3O2. Adding a guanidine group (C1H3N3) → C8H8N6O2. We need 5 more carbons and 4 more hydrogens.\n\nConsider a benzene ring with a nitro group and a guanidine group. Benzene (C6H6) with nitro (NO2) → C6H5N1O2. Adding guanidine (C1H3N3) → C7H8N4O2. We need 6 more carbons and 4 more hydrogens.\n\nConsider a biphenyl dinitro compound. Biphenyl (C12H10) with two nitro groups → C12H8N2O4. Adding a guanidine group (C1H3N3) → C13H11N5O4. 
We need one more nitrogen and one more oxygen.\n\nConsider a nitro group and a guanidine group on a benzene ring. Benzene (C6H6) with nitro (NO2) → C6H5N1O2. Adding guanidine (C1H3N3) → C7H8N4O2. Adding a phenyl group (C6H5) → C13H12N4O2. We need two more nitrogens and three more oxygens.\n\nConsider a nitro group and a guanidine group on a benzene ring with a carboxylic acid group. Benzene (C6H6) with nitro (NO2) → C6H5N1O2. Adding guanidine (C1H3N3) → C7H8N4O2. Adding a carboxylic acid (COOH) → C8H8N4O4. Adding a phenyl (C6H5) → C14H12N4O4. We need two more nitrogens and one more oxygen.\n\nConsider a nitro group and a guanidine group on a benzene ring with a carboxylic acid group. Benzene (C6H6) with nitro (NO2) → C6H5N1O2. Adding guanidine (C1H3N3) → C7H8N4O2. Adding a carboxylic acid (COOH) → C8H8N4O4. Adding a phenyl (C6H5) → C14H12N4O4. We need two more nitrogens and one more oxygen.\n\nConsider a nitro group and a guanidine group on a benzene ring with a carboxylic acid group. Benzene (C6H6) with nitro (NO2) → C6H5N1O2. Adding guanidine (C1H3N3) → C7H8N4O2. Adding a carboxylic acid (COOH) → C8H8N4O4. Adding a phenyl (C6H5) → C14H12N4O4. We need two more nitrogens and one more oxygen.\n\nAt last, consider nitrobenzene (C6H5N1O2) with a guanidine group (C1H3N3) → C7H8N4O2. Adding a nitro group (NO2) → C7H7N5O4. Adding a carboxylic acid (COOH) → C8H7N5O6. Adding a pentylamine (C5H11N) → C13H18N6O6. We need to lose 6 hydrogens.\n\nAt last, consider nitrobenzene (C6H5N1O2) with a guanidine group (C1H3N3) → C7H8N4O2. Adding a phenyl (C6H5) → C13H12N4O2. Adding a nitro group (NO2) → C13H11N5O4. Adding a NH2 group → C13H12N6O4. We need one more oxygen. Adding a hydroxyl group → C13H12N6O5.\n\nConsider nitrobenzene (C6H5N1O2) with a guanidine group (C1H3N3) → C7H8N4O2. Adding phenyl (C6H5) → C13H12N4O2. Adding a nitro group (NO2) → C13H11N5O4. Adding a NH2 group → C13H12N6O4. 
Adding a hydroxyl group → C13H12N6O5.\n\nThe SMILES for this structure is:\n\n**O=[N+]([O-])c1ccc(NC(=N)N)cc1c2cc([N+](=O)[O-])c(N)cc2O**\n\nThis structure includes a charged group and a nitro group, and the formula is C13H12N6O5.<|think_end|><|answer_start|>O=[N+]([O-])c1ccc(NC(=N)N)cc1Oc2cc([N+](=O)[O-])c(N)cc2<|answer_end|>","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":null,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":null,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":146,"input_tokens_details":{"cached_tokens":0},"output_tokens":2462,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":2608},"user":null},"reward":1.0,"extracted_answer":"O=[N+]([O-])c1ccc(NC(=N)N)cc1Oc2cc([N+](=O)[O-])c(N)cc2","eval_function":"functional_group_eval","problem_type":"functional-group","_ng_task_index":0,"_ng_rollout_index":0} diff --git a/environments/ether0/prepare.py b/environments/ether0/prepare.py new file mode 100644 index 000000000..de8944005 --- /dev/null +++ b/environments/ether0/prepare.py @@ -0,0 +1,142 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import json +import sys +from pathlib import Path + +from datasets import load_dataset + + +_OPEN_ENDED = { + "molecule-name", + "molecule-completion", + "molecule-formula", + "simple-formula", + "reaction-prediction", + "functional-group", + "reaction-name", + "retro-synthesis", + "oracle-solubility", +} + +_LABELS = ["A", "B", "C", "D", "E", "F"] + +_MATH_TEMPLATE = ( + "Solve the following chemistry problem step by step. " + "Put your answer inside \\boxed{{}}.\n\n{problem}\n\n" + "Remember to put your answer inside \\boxed{{}}." +) + +_MCQ_TEMPLATE = ( + "Answer the following multiple choice question. " + "The last line of your response should be in the following format: " + "'Answer: A/B/C/D' (e.g. 'Answer: A').\n\n{problem}" +) + + +def _apply_boxed_letter_format(problem: str, problem_type: str, ideal: str | None) -> tuple[str, str | None, dict]: + if problem_type in _OPEN_ENDED or problem_type.split("/")[0] in _OPEN_ENDED: + return _MATH_TEMPLATE.format(problem=problem), ideal, {} + + lines = [line for line in problem.strip().split("\n") if line.strip()] + q_lines = [line for line in lines if " " in line] + c_lines = [line for line in lines if " " not in line] + labels = _LABELS[: len(c_lines)] + choices = {lbl: ch for lbl, ch in zip(labels, c_lines)} + + answer_label = next((lbl for lbl, ch in zip(labels, c_lines) if ch == ideal), None) + labeled = "\n".join(f"{lbl}. 
{ch}" for lbl, ch in zip(labels, c_lines)) + prompt = _MCQ_TEMPLATE.format(problem=f"{' '.join(q_lines)}\n\n{labeled}") + return prompt, answer_label, {"choices": choices} + + +def format_row(row: dict, boxed_letter_format: bool = False) -> dict: + problem = row["problem"] + problem_type = row.get("problem_type", "") + ideal = row.get("ideal") + extra_meta: dict = {} + + if boxed_letter_format: + problem, ideal, extra_meta = _apply_boxed_letter_format(problem, problem_type, ideal) + input_messages = [{"role": "user", "content": problem}] + else: + input_messages = [ + { + "role": "system", + "content": ( + "You are a scientific reasoning agent. " + "Think step by step, then place your final answer inside tags. " + "For example: CCO" + ), + }, + {"role": "user", "content": problem}, + ] + + return { + "responses_create_params": {"input": input_messages}, + "verifier_metadata": { + "solution": row["solution"], + "problem_type": problem_type, + "ideal": ideal, + "id": row.get("id"), + **extra_meta, + }, + "agent_ref": {"type": "responses_api_agents", "name": "ether0_simple_agent"}, + } + + +def main() -> None: + parser = argparse.ArgumentParser(description="Download and prepare ether0-benchmark for NeMo Gym") + parser.add_argument("--output", required=True, help="Output JSONL path") + parser.add_argument("--problem-types", nargs="*", default=None, help="Problem type prefixes to include") + parser.add_argument("--limit", type=int, default=None, help="Max rows to output") + parser.add_argument( + "--boxed-letter-format", + action="store_true", + help="Use boxed/letter prompt format: open-ended -> \\boxed{}, MCQ -> Answer: LETTER", + ) + args = parser.parse_args() + + ds = load_dataset("futurehouse/ether0-benchmark", split="test") + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + count = 0 + with open(output_path, "w") as fout: + for row in ds: + if args.problem_types: + pt = row.get("problem_type", "") + if not 
any(pt.startswith(p) for p in args.problem_types): + continue + + fout.write( + json.dumps(format_row(row, boxed_letter_format=args.boxed_letter_format), ensure_ascii=False) + "\n" + ) + count += 1 + + if args.limit and count >= args.limit: + break + + print(f"Wrote {count} rows to {output_path}", file=sys.stderr) + + +# python scripts/prepare_ether0.py --output data/val.jsonl +# python scripts/prepare_ether0.py --output data/example.jsonl --limit 5 +# python scripts/prepare_ether0.py --output data/val_reactions.jsonl --problem-types reaction-prediction retro-synthesis +# python scripts/prepare_ether0.py --output data/val_boxed_letter.jsonl --boxed-letter-format +if __name__ == "__main__": + main()