35 changes: 35 additions & 0 deletions src/chap14_reinforcement_learning/README.md
@@ -113,3 +113,38 @@ action = agent.place(state, enables)
* Add a model-evaluation and training-data generation module
* Visualize Q-value outputs (to ease analysis)


---

## Engineering Update (2026-05-05)

This commit keeps the original Reversi RL workflow but adds lightweight engineering improvements:

1. Configurable training parameters via environment variables:
   - `REVERSI_SEED`
   - `REVERSI_MAX_EPOCHS`
   - `REVERSI_RENDER_INTERVAL`
   - `REVERSI_MAX_STEPS`
   - `REVERSI_REPORT_OUT`
2. JSON report export after each run (shape sketched below):
   - default: `outputs/reversi_train_report.json`
3. Dry-run mode for quick verification without running Gym training:
   - `REVERSI_DRY_RUN=1`
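
For reference, the dry-run report written to `REVERSI_REPORT_OUT` has roughly this shape; the field names come from `reversi_main.py` below, and the values shown assume the default configuration:

```python
# Payload written by _save_report when REVERSI_DRY_RUN=1 (default config shown).
report = {
    "dry_run": True,
    "config": {
        "seed": 42,
        "max_epochs": 100,
        "render_interval": 10,
        "max_steps_per_episode": 100,
    },
    "note": "DRY_RUN enabled, training loop skipped.",
}
```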

### PowerShell examples

```powershell
# 1) Dry run (no Gym training loop)
$env:REVERSI_DRY_RUN=1
$env:REVERSI_REPORT_OUT="outputs/reversi_train_report.json"
python .\reversi_main.py

# 2) Normal run with custom parameters
$env:REVERSI_DRY_RUN=0
$env:REVERSI_SEED=42
$env:REVERSI_MAX_EPOCHS=20
$env:REVERSI_RENDER_INTERVAL=5
$env:REVERSI_MAX_STEPS=60
$env:REVERSI_REPORT_OUT="outputs/reversi_train_report.json"
python .\reversi_main.py
```
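
On shells other than PowerShell, the same variables can be set in-process and the script run from Python. A minimal wrapper sketch (the use of `runpy` and the relative path are assumptions about your layout, not part of this commit):

```python
# Hypothetical cross-platform launcher: export the config, then run the script.
import os
import runpy

os.environ["REVERSI_DRY_RUN"] = "1"
os.environ["REVERSI_REPORT_OUT"] = "outputs/reversi_train_report.json"
runpy.run_path("reversi_main.py", run_name="__main__")
```
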
324 changes: 185 additions & 139 deletions src/chap14_reinforcement_learning/reversi_main.py
@@ -1,141 +1,187 @@
# ==============================================
# Reversi (Othello) reinforcement learning training code
# Core logic: black uses a random policy, white uses a custom RL agent (RL_QG_agent)
# Training flow: register env -> create env -> init agent -> play training games -> tally results -> save model
# ==============================================

# Basic utilities
import random  # random move policy for black
import gym  # OpenAI Gym: standard RL environment framework (env creation and interaction API)
from gym.envs.registration import register  # Gym registration function for custom environments
import numpy as np  # numerical computing library, used for board statistics (e.g. score counting)

# Custom modules
from gym.envs.reversi.reversi import ReversiEnv  # custom Reversi env class (board rules, state management, ...)
from RL_QG_agent import RL_QG_agent  # custom RL agent class (Q-learning or another RL algorithm)

# ==============================================
# Step 1: register the custom Reversi environment
# Gym requires a custom env to be registered before gym.make() can create it
# ==============================================
register(
    id='Reversi8x8-v0',  # unique env identifier (used when creating the env below)
    entry_point='gym.envs.reversi.reversi:ReversiEnv',  # path to the env class (package.module:ClassName)
    kwargs={  # init arguments passed to ReversiEnv
        'player_color': 'black',  # starting color (black moves first)
        'opponent': 'random',  # opponent type (random policy here, i.e. black is the random player)
        'observation_type': 'numpy3c',  # observation type: 3-channel numpy array (black stones, white stones, empty squares)
        'illegal_place_mode': 'lose',  # illegal moves lose the game immediately
        'board_size': 8  # board size (standard 8x8 Reversi)
    },
    max_episode_steps=1000,  # step cap per game (prevents infinite loops)
)

# Verify that the environment registered successfully
envs = [spec.id for spec in gym.envs.registry.all()]  # IDs of all registered environments
print("Reversi8x8-v0 registered:", 'Reversi8x8-v0' in envs)

# ==============================================
# Step 2: create the Reversi environment instance
# Create an interactive env object from the registered ID
# ==============================================
env = gym.make(
    'Reversi8x8-v0',  # target env ID (must match the registered one)
    player_color='black',  # overrides the registered default: starting player is black
    opponent='random',  # opponent uses a random policy
    observation_type='numpy3c',  # 3-channel numpy observation
    illegal_place_mode='lose'  # illegal moves lose immediately
)

# ==============================================
# Step 3: initialize the RL agent (white player)
# ==============================================
agent = RL_QG_agent()  # instantiate the custom RL agent (controls white)
agent.init_model()  # initialize the agent's model (Q-table, neural network, ...)
agent.load_model()  # load a pretrained model if one exists, so training can continue from it

# ==============================================
# Step 4: training parameters
# ==============================================
max_epochs = 100  # total number of training games (adjust as needed, e.g. 1000 or 10000)
render_interval = 10  # render the board every 10 games (to visualize training progress)

# ==============================================
# Step 5: main training loop (core logic)
# Outer loop: one iteration per game; inner loop: one move at a time (black, then white)
# ==============================================
for i_episode in range(max_epochs):
    # Reset the env: start a new game and get the initial observation (board state)
    observation = env.reset()

    # Per-game step loop (at most 100 steps, to avoid overlong games)
    for t in range(100):
        ################### Black's turn (random-policy player) ###################
        # Render the board at the configured interval
        if i_episode % render_interval == 0:
            env.render()  # visualize the board (current move, overall layout)

        enables = env.possible_actions  # all legal moves for black (a list)

        # Black chooses an action: "pass" if there is no legal move
        if len(enables) == 0:
            # Action encoding: board size squared + 1 means pass (8x8 board -> 64 + 1 = 65)
            action_black = env.board_size**2 + 1
        else:
            action_black = random.choice(enables)  # pick a random legal move

        # Execute black's move and read the environment feedback:
        # observation: new board state after the move
        # reward: immediate reward (not used explicitly here; the agent may use it during training)
        # done: whether the game has ended (True = over, False = continue)
        # info: extra information (move legality, current player, ...; optional)
        observation, reward, done, info = env.step(action_black)

        if done:  # if the game ended after black's move (e.g. both passed or the board is full), break
            break

        ################### White's turn (RL agent) ###################
        # Render the board, same cadence as on black's turn
        if i_episode % render_interval == 0:
            env.render()

        enables = env.possible_actions  # all legal moves for white

        # The agent chooses an action: "pass" if there is no legal move
        if not enables:  # equivalent to len(enables) == 0
            action_white = env.board_size ** 2 + 1  # pass-action encoding
        else:
            # The agent picks the best move given the current observation (board state) and the legal actions
            action_white = agent.place(observation, enables)

        # Execute white's move and read the environment feedback
        observation, reward, done, info = env.step(action_white)

        if done:  # if the game ended after white's move, break out of the step loop
            break

    # ==============================================
    # After each game: tally and print the result
    # ==============================================
    print(f"\nGame {i_episode + 1} finished, total steps: {t + 1}")  # t is 0-based, so +1 gives the actual step count

    # Scoring: on the board 1 marks black stones and -1 marks white stones; summing counts each side's stones
    black_score = np.sum(env.board == 1)   # black's score (stone count)
    white_score = np.sum(env.board == -1)  # white's score (stone count)
    print(f"Black (random policy): {black_score} stones, White (RL agent): {white_score} stones")

    # Determine the result
    if black_score > white_score:
        print("Result: black wins!")
    elif black_score < white_score:
        print("Result: white wins!")
    else:
        print("Result: draw!")

# ==============================================
# Step 6: post-training wrap-up
# ==============================================
agent.save_model()  # save the trained agent's model (overwrite the old one or save as a new file)
env.close()         # close the environment and release resources (render window, memory, etc.)
print(f"\nTraining complete! Played {max_epochs} games in total")


import json
import os
import random
from pathlib import Path

import numpy as np


def _env_int(name, default, minimum=1):
    """Read integer environment variables safely."""
    raw = os.getenv(name, str(default))
    try:
        value = int(raw)
    except (TypeError, ValueError):
        value = default
    return max(value, minimum)


def _save_report(report_path, payload):
    """Save a JSON report."""
    out_path = Path(report_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"[INFO] report saved: {out_path.resolve()}")


# Configurable parameters (see the README for the matching environment variables)
SEED = _env_int("REVERSI_SEED", 42, minimum=0)
MAX_EPOCHS = _env_int("REVERSI_MAX_EPOCHS", 100, minimum=1)
RENDER_INTERVAL = _env_int("REVERSI_RENDER_INTERVAL", 10, minimum=1)
MAX_STEPS_PER_EPISODE = _env_int("REVERSI_MAX_STEPS", 100, minimum=1)
REPORT_OUT = os.getenv("REVERSI_REPORT_OUT", "outputs/reversi_train_report.json")
DRY_RUN = os.getenv("REVERSI_DRY_RUN", "0").strip() == "1"

random.seed(SEED)
np.random.seed(SEED)


if DRY_RUN:
    _save_report(
        REPORT_OUT,
        {
            "dry_run": True,
            "config": {
                "seed": SEED,
                "max_epochs": MAX_EPOCHS,
                "render_interval": RENDER_INTERVAL,
                "max_steps_per_episode": MAX_STEPS_PER_EPISODE,
            },
            "note": "DRY_RUN enabled, training loop skipped.",
        },
    )
    raise SystemExit(0)


# Gym and the agent are imported only after the dry-run exit point,
# so a dry run never touches the training stack.
import gym
from gym.envs.registration import register

from RL_QG_agent import RL_QG_agent


ENV_ID = "Reversi8x8-v0"


def _ensure_registered():
    """Register the Reversi env once (skip if it is already registered)."""
    env_ids = [spec.id for spec in gym.envs.registry.all()]
    if ENV_ID in env_ids:
        return
    register(
        id=ENV_ID,
        entry_point="gym.envs.reversi.reversi:ReversiEnv",
        kwargs={
            "player_color": "black",
            "opponent": "random",
            "observation_type": "numpy3c",
            "illegal_place_mode": "lose",
            "board_size": 8,
        },
        max_episode_steps=1000,
    )


def run_training():
    _ensure_registered()
    env = gym.make(
        ENV_ID,
        player_color="black",
        opponent="random",
        observation_type="numpy3c",
        illegal_place_mode="lose",
    )

    agent = RL_QG_agent()
    agent.init_model()
    agent.load_model()

    print(f"[INFO] {ENV_ID} registered")
    print(
        "[INFO] config:",
        {
            "seed": SEED,
            "max_epochs": MAX_EPOCHS,
            "render_interval": RENDER_INTERVAL,
            "max_steps_per_episode": MAX_STEPS_PER_EPISODE,
            "report_out": REPORT_OUT,
        },
    )

    episode_summaries = []
    for i_episode in range(MAX_EPOCHS):
        observation = env.reset()

        for t in range(MAX_STEPS_PER_EPISODE):
            # Black's turn (random policy)
            if i_episode % RENDER_INTERVAL == 0:
                env.render()

            enables = env.possible_actions
            if len(enables) == 0:
                action_black = env.board_size**2 + 1  # pass
            else:
                action_black = random.choice(enables)

            observation, reward, done, info = env.step(action_black)
            if done:
                break

            # White's turn (RL agent)
            if i_episode % RENDER_INTERVAL == 0:
                env.render()

            enables = env.possible_actions
            if not enables:
                action_white = env.board_size**2 + 1  # pass
            else:
                action_white = agent.place(observation, enables)

            observation, reward, done, info = env.step(action_white)
            if done:
                break

        black_score = int(np.sum(env.board == 1))
        white_score = int(np.sum(env.board == -1))

        if black_score > white_score:
            winner = "black"
        elif black_score < white_score:
            winner = "white"
        else:
            winner = "draw"

        print(
            f"Episode {i_episode + 1}/{MAX_EPOCHS}, steps={t + 1}, "
            f"black={black_score}, white={white_score}, winner={winner}"
        )

        episode_summaries.append(
            {
                "episode": i_episode + 1,
                "steps": t + 1,
                "black_score": black_score,
                "white_score": white_score,
                "winner": winner,
            }
        )

    agent.save_model()
    env.close()

    _save_report(
        REPORT_OUT,
        {
            "dry_run": False,
            "config": {
                "seed": SEED,
                "max_epochs": MAX_EPOCHS,
                "render_interval": RENDER_INTERVAL,
                "max_steps_per_episode": MAX_STEPS_PER_EPISODE,
            },
            "episodes": episode_summaries,
            "final_episode": episode_summaries[-1] if episode_summaries else None,
        },
    )


if __name__ == "__main__":
    run_training()
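
The training loop only assumes four methods on `RL_QG_agent`: `init_model()`, `load_model()`, `place(state, enables)`, and `save_model()`. A hypothetical stub with that interface is enough to smoke-test the script without a trained model (the method bodies below are placeholders, not the real agent):

```python
# Hypothetical stand-in for RL_QG_agent; only the method names and the
# place(state, enables) signature are taken from reversi_main.py.
import random


class StubAgent:
    def init_model(self):
        pass  # the real agent builds its Q-table / network here

    def load_model(self):
        pass  # the real agent restores pretrained weights here

    def place(self, state, enables):
        # the real agent scores each legal action; the stub picks one at random
        return random.choice(enables)

    def save_model(self):
        pass  # the real agent persists its weights here
```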