neulab · yueqis · Mar 9, 2026 · Feb 26, 2026 · Feb 28, 2026 · Feb 28, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,133 @@
+# Agent Data Protocol - Repository Guidelines
+
+This document captures key patterns and best practices for contributing to the Agent Data Protocol repository.
+
+## Repository Structure
+
+```
+agent-data-protocol/
+├── datasets/           # Dataset implementations
+│   └── $DATASET_NAME/
+│       ├── README.md
+│       ├── extract_raw.py
+│       ├── raw_to_standardized.py
+│       ├── schema_raw.py (optional)
+│       ├── api.py (optional)
+│       ├── sample_raw.json
+│       ├── sample_std.json
+│       ├── sample_sft.json
+│       └── sample_sft/
+│           └── sample_sft_$AGENT.json
+├── agents/             # Agent-specific SFT converters
+├── schema/             # ADP standardized format definitions
+├── scripts/            # Utility scripts
+└── tests/              # Validation tests
+```
+
+## Data Flow Pipeline
+
+```
+Raw Dataset      →  Standardized Format  →  Agent Specific SFT Format
+     ↓                   ↓                       ↓
+sample_raw.json  →  sample_std.json      →  sample_sft.json
+```
+
+## Key Requirements
+
+### File Naming
+- Only these JSON files are allowed at the top level of a dataset directory (agent-specific samples live under `sample_sft/`):
+  - `sample_raw.json`
+  - `sample_std.json`
+  - `sample_sft.json`
+  - `generated_thoughts.json`
+- All JSON files MUST have a trailing newline
+
+### SFT Format Requirements
+
+**Critical**: Messages containing function call patterns MUST use `"from": "function_call"`, not `"from": "gpt"`.
+
+Function call patterns that trigger this requirement:
+- `<function=`
+- `<function_calls>`
+- `<invoke name=`
+
+Example correct format:
+```json
+{
+  "from": "function_call",
+  "value": "I'll run the command.\n\n<function=execute_bash>\n<parameter=command>ls -la</parameter>\n</function>"
+}
+```
+
+### Standardized Schema Components
+
+**Actions:**
+- `MessageAction`: Text-based communication
+- `CodeAction`: Code execution requests
+- `ApiAction`: API/function calls with `function` and `kwargs` fields
+
+**Observations:**
+- `TextObservation`: Text-based responses with `source` field (user/environment)
+- `WebObservation`: Web page content
+
+## Commands
+
+### Generate sample files
+```bash
+export MY_DATASET=your_dataset
+export PYTHONPATH=`pwd`:$PYTHONPATH
+
+# Extract raw data (5 samples)
+python datasets/$MY_DATASET/extract_raw.py | head -5 | python scripts/jsonl_to_json.py > datasets/$MY_DATASET/sample_raw.json
+
+# Convert to standardized format
+cat datasets/$MY_DATASET/sample_raw.json | python scripts/json_to_jsonl.py | python datasets/$MY_DATASET/raw_to_standardized.py | python scripts/jsonl_to_json.py > datasets/$MY_DATASET/sample_std.json
+
+# Convert to SFT format (OpenHands)
+cat datasets/$MY_DATASET/sample_std.json | python scripts/json_to_jsonl.py | python agents/openhands/std_to_sft.py --is_web=no --api_env=execute_bash | python scripts/jsonl_to_json.py > datasets/$MY_DATASET/sample_sft/sample_sft_openhands.json
+```
+
+### Run tests
+```bash
+# All tests
+python -m pytest tests/ -v
+
+# Tests for specific dataset
+python -m pytest tests/ -v -k "dataset_name"
+
+# Key validation tests
+python -m pytest tests/test_dataset_structure.py -v
+python -m pytest tests/test_datasets_from_parameter.py -v
+python -m pytest tests/test_standardized_schemas.py -v
+```
+
+## Common Issues
+
+1. **Missing trailing newline**: All JSON files must end with `\n`
+2. **Wrong `from` field**: Function calls must use `"from": "function_call"`
+3. **Extra JSON files**: Remove any temporary `.json` files before committing
+4. **Missing `sample_sft.json`**: Required at the top level of the dataset directory if `sample_std.json` exists
+
+## Post-Processing SFT Files
+
+If your SFT conversion produces `"from": "gpt"` for function calls, apply this fix:
+
+```python
+import json
+
+function_patterns = ['<function=', '<function_calls>', '<invoke name=']
+
+with open('sample_sft.json', 'r', encoding='utf-8') as f:
+    data = json.load(f)
+
+for item in data:
+    for message in item.get('conversations', []):
+        value = message.get('value', '')
+        # Relabel only assistant ("gpt") turns that contain a function-call pattern.
+        if message.get('from') == 'gpt' and any(p in value for p in function_patterns):
+            message['from'] = 'function_call'
+
+with open('sample_sft.json', 'w', encoding='utf-8') as f:
+    json.dump(data, f, indent=2)
+    f.write('\n')  # json.dump() does not end the file with a newline
+```
diff --git a/agents/openhands/std_to_sft.py b/agents/openhands/std_to_sft.py
@@ -30,24 +30,6 @@
 function_args = {"execute_ipython_cell": "code", "execute_bash": "command", "browser": "code"}
 
 
-def _build_thought_text(reasoning_content: str | None, description: str | None) -> str:
-    """Build thought text with reasoning_content wrapped in <think> tags.
-
-    Args:
-        reasoning_content: Extended chain-of-thought reasoning (wrapped in <think> tags)
-        description: Brief action description (included as plain text)
-
-    Returns:
-        Formatted thought text, or empty string if no content
-    """
-    parts = []
-    if reasoning_content:
-        parts.append(f"<think>\n{reasoning_content}\n</think>")
-    if description:
-        parts.append(description)
-    return "\n\n".join(parts) + "\n\n" if parts else ""
-
-
 def verify_args(required_args, optional_args, input_args):
     # all required args should be included
     for arg in required_args:
@@ -114,12 +96,7 @@ def standardized_event_to_openhands_message(
 
     if isinstance(event, ApiAction):
         PREV_BID = None
-        # Build thought text: reasoning_content wrapped in <think> tags, description as plain text
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            event.description,
-        )
-
+        thought = event.description + "\n\n" if event.description else ""
         function_name = event.function
         arguments = {k: v for k, v in event.kwargs.items() if k not in ["element_id", "xpath"]}
 
@@ -211,12 +188,7 @@ def standardized_event_to_openhands_message(
         return {"from": "function_call", "value": f"{thought}{function_call}"}
 
     if isinstance(event, CodeAction):
-        # Build thought text: reasoning_content wrapped in <think> tags, description as plain text
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            event.description,
-        )
-
+        thought = event.description + "\n\n" if event.description else ""
         function_name = action_function.get(event.language, f"execute_{event.language}")
         code_content = event.content
         if function_name not in openhands_default_tools:
@@ -228,12 +200,7 @@ def standardized_event_to_openhands_message(
         return {"from": "function_call", "value": f"{thought}{code_action}"}
 
     elif isinstance(event, MessageAction):
-        # Build thought text: reasoning_content wrapped in <think> tags, description as plain text
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            event.description,
-        )
-
+        thought = event.description + "\n\n" if event.description else ""
         if "<finish>" in event.content and "</finish>" in event.content:
             match = re.search(r"<finish>(.*?)</finish>", event.content, re.DOTALL)
             content = match.group(1).strip()

diff --git a/agents/sweagent/std_to_sft.py b/agents/sweagent/std_to_sft.py
@@ -71,34 +71,13 @@ def verify_args(required_args, optional_args, input_args):
     return True
 
 
-def _build_thought_text(reasoning_content: str | None, description: str | None) -> str:
-    """Build thought text with reasoning_content wrapped in <think> tags.
-
-    Args:
-        reasoning_content: Extended chain-of-thought reasoning (wrapped in <think> tags)
-        description: Brief action description (included as plain text, not wrapped)
-
-    Returns:
-        Formatted thought text, or empty string if no content
-    """
-    parts = []
-    if reasoning_content:
-        parts.append(f"<think>\n{reasoning_content}\n</think>")
-    if description:
-        parts.append(description)
-    return "\n\n".join(parts) + "\n\n" if parts else ""
-
-
 def standardized_event_to_swe_message(
     id,
     event: ApiAction | CodeAction | MessageAction | TextObservation | WebObservation,
     api_sigs=None,
 ) -> dict:
     if isinstance(event, ApiAction):
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            getattr(event, "description", None),
-        )
+        thought = f"<think>\n{event.description}\n</think>\n\n" if event.description else ""
         function_name = event.function
         arguments = {k: v for k, v in event.kwargs.items() if k not in ["element_id", "xpath"]}
 
@@ -123,10 +102,7 @@ def standardized_event_to_swe_message(
         raise ValueError(f"Undefined API: {event}")
 
     if isinstance(event, CodeAction):
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            getattr(event, "description", None),
-        )
+        thought = f"<think>\n{event.description}\n</think>\n\n" if event.description else ""
         code_content = event.content
         if event.language != "bash":
             if event.language == "python" or event.language == "python3":
@@ -138,10 +114,7 @@ def standardized_event_to_swe_message(
         return {"from": "function_call", "value": f"{thought}{code_action}"}
 
     elif isinstance(event, MessageAction):
-        thought = _build_thought_text(
-            getattr(event, "reasoning_content", None),
-            getattr(event, "description", None),
-        )
+        thought = f"<think>\n{event.description}\n</think>\n\n" if event.description else ""
         # convert finish actions to submit actions
         if "<finish>" in event.content and "</finish>" in event.content:
             match = re.search(r"<finish>(.*?)</finish>", event.content, re.DOTALL)

diff --git a/datasets/coderforge_preview/LICENSE b/datasets/coderforge_preview/LICENSE
@@ -0,0 +1,35 @@
+# CoderForge-Preview Dataset - Source Repository Licenses
+
+This dataset contains trajectories generated from repositories released under
+permissive open-source licenses. The source repositories use the following licenses:
+
+| License                                      | SPDX Identifier              |
+|----------------------------------------------|------------------------------|
+| MIT License                                  | MIT                          |
+| BSD 3-Clause                                 | BSD-3-Clause                 |
+| Apache License 2.0                           | Apache-2.0                   |
+| BSD License                                  | BSD                          |
+| BSD 2-Clause                                 | BSD-2-Clause                 |
+| Historical Permission Notice and Disclaimer  | HPND                         |
+| ISC License                                  | ISC                          |
+| PostgreSQL License                           | PostgreSQL                   |
+| Python Software Foundation License           | PSF-2.0                      |
+| Creative Commons Zero 1.0                    | CC0-1.0                      |
+| MIT No Attribution                           | MIT-0                        |
+| MIT-CMU License                              | MIT-CMU                      |
+| BSD 4-Clause                                 | BSD-4-Clause                 |
+| Dual: MIT and Apache-2.0                     | MIT AND Apache-2.0           |
+| Dual: Apache-2.0 and BSD-3-Clause            | Apache-2.0 AND BSD-3-Clause  |
+| Dual: BSD-3-Clause and MIT                   | BSD-3-Clause AND MIT         |
+
+## License Detection
+
+Licenses were identified using scancode-toolkit, the industry-standard license
+detection engine used by the Linux Foundation and the SPDX project. Each LICENSE
+file was retrieved from the repository at the specific commit SHA referenced by
+each task.
+
+## Source
+
+This license information is from the CoderForge-Preview blog post:
+https://www.together.ai/blog/coderforge-preview
diff --git a/datasets/coderforge_preview/README.md b/datasets/coderforge_preview/README.md
@@ -0,0 +1,33 @@
+# CoderForge-Preview Dataset
+
+## Description
+
+CoderForge-Preview is the largest open test-verified coding agent dataset designed for training efficient software engineering agents. The dataset contains agent trajectories solving real-world coding tasks, with all trajectories being test-verified for quality.
+
+Fine-tuning Qwen-3 32B on this dataset boosts SWE-Bench Verified performance from 23.0% to 59.4% pass@1, ranking #1 among open-data and #2 among open-weight models ≤32B parameters.
+
+The dataset focuses on:
+- Large-scale agentic data generation from 51K distinct open-source tasks
+- Long-horizon, multi-step SFT trajectories
+- Test-verified coding agent trajectories
+- Data collected using the OpenHands agent framework
+
+## Paper Citation
+
+```bibtex
+@misc{CoderForge2026,
+  title = {CoderForge-Preview: SOTA Open Dataset for Training Efficient Agents},
+  author = {Ariyak, Alpay and Zhang, Junda and Wang, Junxiong and Zhu, Shang and Bianchi, Federico and Srivastava, Sanjana and Panda, Ashwinee and Bharti, Siddhant and Xu, Chenfeng and Heo, John and Wu, Xiaoxia Shirley and Zhou, James and Liang, Percy and Song, Leon and Zhang, Ce and Athiwaratkun, Ben and Zhou, Zhongzhu and Wu, Qingyang},
+  year = {2026},
+  month = feb,
+  publisher = {TogetherAI Blog},
+  url = {https://www.together.ai/blog/coderforge-preview},
+  note = {Project core leads: Alpay Ariyak; Zhongzhu Zhou; Qingyang Wu}
+}
+```
+
+## Dataset Information
+
+**Source URL**: https://huggingface.co/datasets/togethercomputer/CoderForge-Preview
+
+**License**: Apache-2.0
diff --git a/datasets/coderforge_preview/api.py b/datasets/coderforge_preview/api.py
@@ -0,0 +1,34 @@
+def str_replace_editor(
+    command: str,
+    path: str,
+    file_text: str = None,
+    old_str: str = None,
+    new_str: str = None,
+    insert_line: int = None,
+    view_range: list = None,
+) -> None:
+    """View, create, and edit files with this custom editing tool.
+
+    Args:
+    ----
+        command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
+        path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
+        file_text (str): Required parameter of `create` command, with the content of the file to be created.
+        old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.
+        new_str (str): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
+        insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
+        view_range (list): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
+
+    """
+    pass  # Stub body: only the signature and docstring are declared; no file is read or modified here.
+
+
+def think(thought: str):
+    """Log a thought for reasoning.
+
+    Args:
+    ----
+        thought (str): The thought to log.
+
+    """
+    pass  # Stub body: the thought is neither stored nor printed here.
diff --git a/datasets/coderforge_preview/extract_raw.py b/datasets/coderforge_preview/extract_raw.py
@@ -0,0 +1,16 @@
+import json
+
+from datasets import load_dataset
+
+# Load every split of the "trajectories" config; only `split` below is emitted.
+dataset = load_dataset("togethercomputer/CoderForge-Preview", "trajectories")
+seen_counts = {}  # trajectory_id -> number of rows already emitted with that id
+split = "filtered_reward1"
+for item in dataset[split]:
+    trajectory_id = str(item["trajectory_id"])
+    occurrence = seen_counts.get(trajectory_id, 0)
+    seen_counts[trajectory_id] = occurrence + 1
+    # Suffix with the occurrence index so repeated trajectory ids stay unique.
+    item["id"] = f"{trajectory_id}_{occurrence}"
+    item["messages"] = json.loads(item["messages"])  # stored as a JSON-encoded string
+    print(json.dumps(item))