From faebb718665e542a031993721d64592a533f5efb Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Fri, 29 May 2026 15:46:18 -0700 Subject: [PATCH] init Signed-off-by: Christian Munley --- environments/blackjack/README.md | 35 +++++++++++++ environments/blackjack/__init__.py | 0 environments/blackjack/config.yaml | 27 ++++++++++ environments/blackjack/data/.gitignore | 5 ++ environments/blackjack/data/example.jsonl | 5 ++ .../blackjack/data/example_metrics.json | 38 ++++++++++++++ .../blackjack/data/example_rollouts.jsonl | 5 ++ environments/blackjack/prepare.py | 49 +++++++++++++++++++ 8 files changed, 164 insertions(+) create mode 100644 environments/blackjack/README.md create mode 100644 environments/blackjack/__init__.py create mode 100644 environments/blackjack/config.yaml create mode 100644 environments/blackjack/data/.gitignore create mode 100644 environments/blackjack/data/example.jsonl create mode 100644 environments/blackjack/data/example_metrics.json create mode 100644 environments/blackjack/data/example_rollouts.jsonl create mode 100644 environments/blackjack/prepare.py diff --git a/environments/blackjack/README.md b/environments/blackjack/README.md new file mode 100644 index 000000000..adfecbb54 --- /dev/null +++ b/environments/blackjack/README.md @@ -0,0 +1,35 @@ +# Blackjack + +Multi-step gymnasium-style environment. + +The model hits or stands using `<action>` tags until the hand ends. Game state is managed per session. + +Example data is provided in `data/example.jsonl` (prompt messages only, no verifier_metadata needed). No train/validation data is checked in. + +## Run + +```bash +ng_run "+config_paths=[environments/blackjack/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" +``` + +## Data + +Each game is generated on the fly during `reset()`, so every row in `example.jsonl` is identical. To create more data, duplicate the row. Each rollout gets a fresh random deal. 
Use `num_repeats` in the YAML config or the `+num_repeats` CLI flag to control how many games are played per row. + +## Collect rollouts + +```bash +ng_collect_rollouts \ + +agent_name=blackjack_gymnasium_agent \ + +input_jsonl_fpath=environments/blackjack/data/example.jsonl \ + +output_jsonl_fpath=results/blackjack_rollouts.jsonl +``` + + +## Prepare training data + +```bash +python environments/blackjack/prepare.py --size 1000 +``` + +Each row in the generated JSONL is identical (same as `example.jsonl`) — a fresh game is dealt on the resources server side per rollout. Use `--size` to control how many rows are written. diff --git a/environments/blackjack/__init__.py b/environments/blackjack/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/environments/blackjack/config.yaml b/environments/blackjack/config.yaml new file mode 100644 index 000000000..dd7dc9215 --- /dev/null +++ b/environments/blackjack/config.yaml @@ -0,0 +1,27 @@ +blackjack: + resources_servers: + blackjack: + entrypoint: app.py + domain: games + verified: false + description: Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. 
+ value: Example gymnasium-style multi-step environment +blackjack_gymnasium_agent: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: blackjack + model_server: + type: responses_api_models + name: policy_model + max_steps: 5 + datasets: + - name: example + type: example + jsonl_fpath: environments/blackjack/data/example.jsonl + - name: train + type: train + jsonl_fpath: environments/blackjack/data/train.jsonl + license: Apache 2.0 diff --git a/environments/blackjack/data/.gitignore b/environments/blackjack/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/environments/blackjack/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/environments/blackjack/data/example.jsonl b/environments/blackjack/data/example.jsonl new file mode 100644 index 000000000..f48a084e1 --- /dev/null +++ b/environments/blackjack/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. 
Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} diff --git a/environments/blackjack/data/example_metrics.json b/environments/blackjack/data/example_metrics.json new file mode 100644 index 000000000..ca1f0810a --- /dev/null +++ b/environments/blackjack/data/example_metrics.json @@ -0,0 +1,38 @@ +{ + "name": "example", + "type": "example", + "jsonl_fpath": "resources_servers/blackjack/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": null, + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 30.0, + "Min": 30.0, + "Max": 30.0, + "Standard deviation": 0.0 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + } +} \ No newline at end of file diff --git a/environments/blackjack/data/example_rollouts.jsonl 
b/environments/blackjack/data/example_rollouts.jsonl new file mode 100644 index 000000000..6175f946f --- /dev/null +++ b/environments/blackjack/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_0816d85808c14ee38cb42be5b1ae4863","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,19,11,220,17,60,284,220,21,198,93909,4933,25,220,19,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.0001250427303602919,-1.5497195136049413e-6,-1.1920928244535389e-7,-0.693234920501709,0.0,0.0,-3.576278118089249e-7,-3.2186455882765586e-6],"id":"msg_b266aeb4d0be4afc80b0931cb97cd083","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"statu
s":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":44,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":52},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[4, 2]","player_value":6,"dealer":"[4, 2, 2, K]","dealer_value":18},"_ng_task_index":0,"_ng_rollout_index":1,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_c018550a3d174fe987939919762706f8","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,24,11,220,24,60,284,220,16,23,198,93909,4933,25,220,18,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-1.5497195136049413e-6,-1.6689286894688848e-6,0.0,-0.00012838016846217215,0.0,0.0,-2.3841855067985307e-7,-3.576272320060525e-6],"id":"msg_02f9a7cb0eac41a8b13c2c690534a43b","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool
_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":45,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":53},"user":null},"reward":1.0,"terminated":true,"truncated":false,"info":{"result":"win","player":"[9, 9]","player_value":18,"dealer":"[3, 9, 3, Q]","dealer_value":25},"_ng_task_index":0,"_ng_rollout_index":2,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_1f8b38af629d4252818b94897f4e599a","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,19,11,619,60,284,220,16,19,198,93909,4933,25,362,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.000011920858014491387,-2.0265558760002023e-6,0.0,-0.038049791008234024,0.0,0.0,-3.576278118089249e-7,-3.33785
46504536644e-6],"id":"msg_9046c12edb9d48d992ce6cd8dbeed1eb","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":43,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":51},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[4, J]","player_value":14,"dealer":"[A, J]","dealer_value":21},"_ng_task_index":0,"_ng_rollout_index":4,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. 
Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_1d3acaf11fe142ffaaddc9925de94d78","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,220,16,15,60,284,220,16,22,198,93909,4933,25,619,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.00004053033626405522,-1.6689286894688848e-6,0.0,-0.0011742371134459972,0.0,0.0,-2.3841855067985307e-7,-2.50339189733495e-6],"id":"msg_b2b71c2b4b5d4e059b77886de80db892","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":45,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":53},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[7, 10]","player_value":17,"dealer":"[J, 5, 
3]","dealer_value":18},"_ng_task_index":0,"_ng_rollout_index":3,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_a97b2558ac874f2ba5684a67e9b941c2","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,1207,60,284,220,16,22,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,22492,522,1311,29,151645],"generation_log_probs":[-2.0265558760002023e-6,-1.6689286894688848e-6,-1.1920928244535389e-7,-0.10021035373210907,0.0,0.0,0.0,-5.722029527532868e-6],"id":"msg_601a2716de0a4eb38494d2131674c825","content":[{"annotations":[],"text":"hit","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"},{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,1207,60,284,220,16,22,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198,27,1311,29,22492,522,1311,29,151645,198,151644,872,198,7771,1424,25,508,22,11,1207,11,220,18,60,284,220,17,15,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,26
85,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-1.1920928244535389e-7,-2.3841855067985307e-7,-7.152555099310121e-7,0.0,0.0,-1.1920928244535389e-7,0.0,-1.6689286894688848e-6],"id":"msg_7de40025e0a7414594284d76461fe177","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":144,"input_tokens_details":{"cached_tokens":0},"output_tokens":16,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":160},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[]","player_value":0,"dealer":"[8, A]","dealer_value":19},"_ng_task_index":0,"_ng_rollout_index":0,"agent_ref":{"name":"blackjack_agent"}} diff --git a/environments/blackjack/prepare.py b/environments/blackjack/prepare.py new file mode 100644 index 000000000..d33792fcb --- /dev/null +++ b/environments/blackjack/prepare.py @@ -0,0 +1,49 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate a blackjack dataset by duplicating the example.jsonl template.

Each game is dealt on the fly during the resources server's `reset()`, so
every row is identical. This script just sizes the output JSONL.

Usage:
    python environments/blackjack/prepare.py --size 1000
    python environments/blackjack/prepare.py --size 1000 --split train
"""

import argparse
import json
from pathlib import Path
from typing import Optional, Sequence


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Write ``data/<split>.jsonl`` containing ``--size`` copies of the template row.

    The template is the first non-blank line of ``data/example.jsonl``, located
    next to this script. It is parsed and re-serialized once, so a malformed
    template fails before the output file is created or truncated.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            command-line entry point uses.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) when ``--size`` is
            negative or when the output path would be the template itself.
        ValueError: If the template file has no non-blank line, or that line
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        OSError: If the template cannot be read or the output cannot be written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--size", type=int, default=1000, help="Number of rows to generate.")
    parser.add_argument("--split", default="train", help="Output split name (train, validation, etc.)")
    args = parser.parse_args(argv)

    # range() would silently treat a negative size as zero while the summary
    # line below still reported the negative number.
    if args.size < 0:
        parser.error(f"--size must be non-negative, got {args.size}")

    here = Path(__file__).parent
    template_path = here / "data" / "example.jsonl"
    output_path = here / "data" / f"{args.split}.jsonl"

    # Opening the output with "w" truncates it; refuse to clobber the template.
    if output_path.resolve() == template_path.resolve():
        parser.error(f"--split {args.split!r} would overwrite the template {template_path}")

    # JSON interchange is UTF-8; do not depend on the platform's locale default.
    with template_path.open(encoding="utf-8") as f:
        first_row = next((line for line in f if line.strip()), None)
    if first_row is None:
        raise ValueError(f"{template_path} contains no rows to use as a template")

    # Every output row is identical, so validate and serialize exactly once.
    serialized_row = json.dumps(json.loads(first_row)) + "\n"

    with output_path.open("w", encoding="utf-8") as f:
        f.writelines(serialized_row for _ in range(args.size))

    print(f"Wrote {args.size} rows to {output_path}")


if __name__ == "__main__":
    main()