From 2d60185fb45469b4db7ae38d66005fe18fe78c5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?PyTorch=5F=EB=A9=98=ED=8B=B0=5F=EA=B0=95=EC=A0=95=EB=AF=BC?=
 <gangjeongmin23@gmail.com>
Date: Sat, 16 May 2026 17:40:30 +0900
Subject: [PATCH] =?UTF-8?q?translate:=20intermediate=5Fsource/dqn=5Fwith?=
 =?UTF-8?q?=5Frnn=5Ftutorial.py=20=EB=B2=88=EC=97=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 intermediate_source/dqn_with_rnn_tutorial.py | 350 +++++++++----------
 1 file changed, 167 insertions(+), 183 deletions(-)

diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py
index bcc484f0a..63f0bcb81 100644
--- a/intermediate_source/dqn_with_rnn_tutorial.py
+++ b/intermediate_source/dqn_with_rnn_tutorial.py
@@ -1,20 +1,20 @@
 # -*- coding: utf-8 -*-
 
 """
-Recurrent DQN: Training recurrent policies
+순환 DQN: 순환 정책 학습하기
 ==========================================
 
-**Author**: `Vincent Moens <https://github.com/vmoens>`_
+**저자**: `Vincent Moens <https://github.com/vmoens>`_
 
 .. grid:: 2
 
-    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+    .. grid-item-card:: :octicon:`mortar-board;1em;` 배울 내용
        :class-card: card-prerequisites
 
-       * How to incorporating an RNN in an actor in TorchRL
-       * How to use that memory-based policy with a replay buffer and a loss module
+       * TorchRL에서 액터에 RNN을 통합하는 방법
+       * 메모리 기반 정책을 리플레이 버퍼 및 손실 모듈과 함께 사용하는 방법
 
-    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+    .. grid-item-card:: :octicon:`list-unordered;1em;` 사전 준비 사항
        :class-card: card-prerequisites
 
        * PyTorch v2.0.0
@@ -23,42 +23,40 @@
 """
 
 #########################################################################
-# Overview
+# 개요
 # --------
 #
-# Memory-based policies are crucial not only when the observations are partially
-# observable but also when the time dimension must be taken into account to
-# make informed decisions.
+# 메모리 기반 정책(policy)은 관측이 부분적으로만 가능한 경우뿐만 아니라
+# 의사결정을 위해 시간 차원을 고려해야 하는 경우에도 매우 중요합니다.
 #
-# Recurrent neural network have long been a popular tool for memory-based
-# policies. The idea is to keep a recurrent state in memory between two
-# consecutive steps, and use this as an input to the policy along with the
-# current observation.
+# 순환 신경망(Recurrent Neural Network)은 오랫동안 메모리 기반 정책에 널리 사용되어
+# 왔습니다. 핵심 아이디어는 두 연속 단계(step) 사이에 순환 상태(recurrent state)를
+# 메모리에 유지하고, 현재 관측과 함께 정책의 입력으로 사용하는 것입니다.
 #
-# This tutorial shows how to incorporate an RNN in a policy using TorchRL.
+# 이 튜토리얼에서는 TorchRL을 사용하여 정책에 RNN을 통합하는 방법을 보여줍니다.
 #
-# Key learnings:
+# 핵심 학습 내용:
 #
-# - Incorporating an RNN in an actor in TorchRL;
-# - Using that memory-based policy with a replay buffer and a loss module.
+# - TorchRL에서 액터에 RNN 통합하기
+# - 메모리 기반 정책을 리플레이 버퍼 및 손실 모듈과 함께 사용하기
 #
-# The core idea of using RNNs in TorchRL is to use TensorDict as a data carrier
-# for the hidden states from one step to another. We'll build a policy that
-# reads the previous recurrent state from the current TensorDict, and writes the
-# current recurrent states in the TensorDict of the next state:
+# TorchRL에서 RNN을 사용하는 핵심 아이디어는 TensorDict를 한 단계에서 다음 단계로
+# 은닉 상태(hidden state)를 전달하는 데이터 운반체로 사용하는 것입니다. 이전
+# 순환 상태를 현재 TensorDict에서 읽고, 현재 순환 상태를 다음 상태의
+# TensorDict에 기록하는 정책을 구성합니다.
 #
 # .. figure:: /_static/img/rollout_recurrent.png
-#    :alt: Data collection with a recurrent policy
+#    :alt: 순환 정책을 사용한 데이터 수집
 #
-# As this figure shows, our environment populates the TensorDict with zeroed recurrent
-# states which are read by the policy together with the observation to produce an
-# action, and recurrent states that will be used for the next step.
-# When the :func:`~torchrl.envs.utils.step_mdp` function is called, the recurrent states
-# from the next state are brought to the current TensorDict. Let's see how this
-# is implemented in practice.
+# 이 그림에서 보듯이 환경은 TensorDict에 0으로 초기화된 순환 상태를 채우고,
+# 정책은 이를 관측과 함께 읽어 행동과 다음 단계에 사용할 순환 상태를
+# 생성합니다.
+# :func:`~torchrl.envs.utils.step_mdp` 함수가 호출되면 다음 상태의 순환 상태가
+# 현재 TensorDict로 옮겨집니다. 이것이 실제로 어떻게 구현되는지
+# 살펴보겠습니다.
 
 ######################################################################
-# If you are running this in Google Colab, make sure you install the following dependencies:
+# Google Colab에서 실행하는 경우 다음 의존성을 설치해야 합니다.
 #
 # .. code-block:: bash
 #
@@ -66,7 +64,7 @@
 #    !pip3 install gym[mujoco]
 #    !pip3 install tqdm
 #
-# Setup
+# 설정
 # -----
 #
 
@@ -76,9 +74,9 @@
 warnings.filterwarnings("ignore")
 from torch import multiprocessing
 
-# TorchRL prefers spawn method, that restricts creation of  ``~torchrl.envs.ParallelEnv`` inside
-# `__main__` method call, but for the easy of reading the code switch to fork
-# which is also a default spawn method in Google's Colaboratory
+# TorchRL은 spawn 시작 방식(start method)을 선호하지만, 이 방식에서는 ``~torchrl.envs.ParallelEnv`` 를
+# `__main__` 메소드 호출 내부에서만 생성할 수 있습니다. 여기서는 코드의 가독성을 위해 fork로 전환합니다.
+# fork는 Google Colaboratory의 기본 시작 방식이기도 합니다.
 try:
     multiprocessing.set_start_method("fork")
 except RuntimeError:
@@ -117,42 +115,37 @@
 )
 
 ######################################################################
-# Environment
+# 환경
 # -----------
 #
-# As usual, the first step is to build our environment: it helps us
-# define the problem and build the policy network accordingly. For this tutorial,
-# we'll be running a single pixel-based instance of the CartPole gym
-# environment with some custom transforms: turning to grayscale, resizing to
-# 84x84, scaling down the rewards and normalizing the observations.
+# 먼저 환경을 구성합니다. 이를 통해 문제를 정의하고 그에 맞는 정책 네트워크를
+# 구성할 수 있습니다. 이 튜토리얼에서는 픽셀 기반 CartPole gym 환경의
+# 인스턴스 하나를 사용하며, 몇 가지 사용자 정의 변환(transform)을 적용합니다.
+# 그레이스케일 변환, 84x84 크기 변경, 보상 스케일링, 관측 정규화 등을 수행합니다.
 #
 # .. note::
-#   The :class:`~torchrl.envs.transforms.StepCounter` transform is accessory. Since the CartPole
-#   task goal is to make trajectories as long as possible, counting the steps
-#   can help us track the performance of our policy.
-#
-# Two transforms are important for the purpose of this tutorial:
-#
-# - :class:`~torchrl.envs.transforms.InitTracker` will stamp the
-#   calls to :meth:`~torchrl.envs.EnvBase.reset` by adding a ``"is_init"``
-#   boolean mask in the TensorDict that will track which steps require a reset
-#   of the RNN hidden states.
-# - The :class:`~torchrl.envs.transforms.TensorDictPrimer` transform is a bit more
-#   technical. It is not required to use RNN policies. However, it
-#   instructs the environment (and subsequently the collector) that some extra
-#   keys are to be expected. Once added, a call to `env.reset()` will populate
-#   the entries indicated in the primer with zeroed tensors. Knowing that
-#   these tensors are expected by the policy, the collector will pass them on
-#   during collection. Eventually, we'll be storing our hidden states in the
-#   replay buffer, which will help us bootstrap the computation of the
-#   RNN operations in the loss module (which would otherwise be initiated
-#   with 0s). In summary: not including this transform will not impact hugely
-#   the training of our policy, but it will make the recurrent keys disappear
-#   from the collected data and the replay buffer, which will in turn lead to
-#   a slightly less optimal training.
-#   Fortunately, the :class:`~torchrl.modules.LSTMModule` we propose is
-#   equipped with a helper method to build just that transform for us, so
-#   we can wait until we build it!
+#   :class:`~torchrl.envs.transforms.StepCounter` 변환은 보조적입니다. CartPole
+#   태스크의 목표는 궤적(trajectory)을 가능한 한 길게 만드는 것이므로, 단계를 세는 것이
+#   정책의 성능을 추적하는 데 도움이 됩니다.
+#
+# 이 튜토리얼의 목적에서 중요한 두 가지 변환이 있습니다.
+#
+# - :class:`~torchrl.envs.transforms.InitTracker` 는
+#   :meth:`~torchrl.envs.EnvBase.reset` 호출을 표시하여 TensorDict에
+#   ``"is_init"`` 불리언 마스크를 추가합니다. 이를 통해 RNN 은닉 상태를
+#   초기화해야 하는 단계를 추적합니다.
+# - :class:`~torchrl.envs.transforms.TensorDictPrimer` 변환은 좀 더 기술적입니다.
+#   RNN 정책을 사용하는 데 반드시 필요하지는 않습니다. 그러나 환경(및 이후의
+#   수집기)에 추가 키가 필요함을 알려줍니다. 추가되면 ``env.reset()`` 호출 시
+#   프라이머에 지정된 항목이 0으로 초기화된 텐서(Tensor)로 채워집니다. 이 텐서가
+#   정책에 필요하다는 것을 알고 있으므로 수집기는 수집 과정에서 이를 전달합니다.
+#   결국 은닉 상태를 리플레이 버퍼에 저장하게 되며, 이는 손실 모듈에서
+#   RNN 연산의 부트스트랩 계산에 도움이 됩니다(그렇지 않으면 0으로 초기화됩니다).
+#   요약하면 이 변환을 포함하지 않아도 정책 학습에 큰 영향은 없지만, 수집된
+#   데이터와 리플레이 버퍼에서 순환 키가 사라지게 되어 학습이 다소 최적에 미치지
+#   못할 수 있습니다.
+#   다행히 :class:`~torchrl.modules.LSTMModule` 은 이 변환을 자동으로
+#   생성하는 헬퍼 메소드를 제공하므로, 모듈을 구성한 후에 사용하면 됩니다.
 #
 
 env = TransformedEnv(
@@ -169,26 +162,25 @@
 )
 
 ######################################################################
-# As always, we need to initialize manually our normalization constants:
+# 항상 그렇듯이 정규화 상수를 수동으로 초기화해야 합니다.
 #
 env.transform[-1].init_stats(1000, reduce_dim=[0, 1, 2], cat_dim=0, keep_dims=[0])
 td = env.reset()
 
 ######################################################################
-# Policy
+# 정책
 # ------
 #
-# Our policy will have 3 components: a :class:`~torchrl.modules.ConvNet`
-# backbone, an :class:`~torchrl.modules.LSTMModule` memory layer and a shallow
-# :class:`~torchrl.modules.MLP` block that will map the LSTM output onto the
-# action values.
+# 정책은 3개의 구성 요소로 이루어집니다. :class:`~torchrl.modules.ConvNet`
+# 백본, :class:`~torchrl.modules.LSTMModule` 메모리 계층, 그리고 LSTM 출력을
+# 행동 가치(action value)에 매핑하는 얕은 :class:`~torchrl.modules.MLP` 블록입니다.
 #
-# Convolutional network
+# 합성곱 네트워크
 # ~~~~~~~~~~~~~~~~~~~~~
 #
-# We build a convolutional network flanked with a :class:`torch.nn.AdaptiveAvgPool2d`
-# that will squash the output in a vector of size 64. The :class:`~torchrl.modules.ConvNet`
-# can assist us with this:
+# 출력을 크기 64의 벡터로 압축하는 :class:`torch.nn.AdaptiveAvgPool2d` 를 포함한
+# 합성곱 네트워크를 구성합니다. :class:`~torchrl.modules.ConvNet` 이 이를
+# 지원합니다.
 #
 
 feature = Mod(
@@ -203,33 +195,30 @@
     out_keys=["embed"],
 )
 ######################################################################
-# we execute the first module on a batch of data to gather the size of the
-# output vector:
+# 출력 벡터의 크기를 얻기 위해 첫 번째 모듈을 데이터 배치에 대해 실행합니다.
 #
 n_cells = feature(env.reset())["embed"].shape[-1]
 
 ######################################################################
-# LSTM Module
+# LSTM 모듈
 # ~~~~~~~~~~~
 #
-# TorchRL provides a specialized :class:`~torchrl.modules.LSTMModule` class
-# to incorporate LSTMs in your code-base. It is a :class:`~tensordict.nn.TensorDictModuleBase`
-# subclass: as such, it has a set of ``in_keys`` and ``out_keys`` that indicate
-# what values should be expected to be read and written/updated during the
-# execution of the module. The class comes with customizable predefined
-# values for these attributes to facilitate its construction.
+# TorchRL은 코드에 LSTM을 통합하기 위한 전용 :class:`~torchrl.modules.LSTMModule`
+# 클래스를 제공합니다. 이 클래스는 :class:`~tensordict.nn.TensorDictModuleBase` 의
+# 하위 클래스로, ``in_keys`` 와 ``out_keys`` 집합을 가지고 있어 모듈 실행 중
+# 읽고 쓸/갱신할 값을 나타냅니다. 이 클래스는 생성을 쉽게 하기 위해
+# 이러한 속성에 대해 변경 가능한 사전 정의 기본값을 제공합니다.
 #
 # .. note::
-#   *Usage limitations*: The class supports almost all LSTM features such as
-#   dropout or multi-layered LSTMs.
-#   However, to respect TorchRL's conventions, this LSTM must have the ``batch_first``
-#   attribute set to ``True`` which is **not** the default in PyTorch. However,
-#   our :class:`~torchrl.modules.LSTMModule` changes this default
-#   behavior, so we're good with a native call.
+#   *사용 제한 사항*: 이 클래스는 드롭아웃(Dropout)이나 다중 계층 LSTM 등
+#   대부분의 LSTM 기능을 지원합니다.
+#   그러나 TorchRL의 규칙을 준수하기 위해 이 LSTM은 ``batch_first``
+#   속성이 ``True`` 로 설정되어야 하며, 이는 PyTorch의 기본값이 **아닙니다**. 그러나
+#   :class:`~torchrl.modules.LSTMModule` 은 이 기본 동작을 변경하므로
+#   기본 호출만으로 충분합니다.
 #
-#   Also, the LSTM cannot have a ``bidirectional`` attribute set to ``True`` as
-#   this wouldn't be usable in online settings. In this case, the default value
-#   is the correct one.
+#   또한 LSTM의 ``bidirectional`` 속성이 ``True`` 로 설정되면 온라인 환경에서
+#   사용할 수 없으므로, 기본값 그대로 사용합니다.
 #
 
 lstm = LSTMModule(
@@ -241,37 +230,34 @@
 )
 
 ######################################################################
-# Let us look at the LSTM Module class, specifically its in and out_keys:
+# LSTM 모듈 클래스의 in_keys와 out_keys를 살펴보겠습니다.
 print("in_keys", lstm.in_keys)
 print("out_keys", lstm.out_keys)
 
 ######################################################################
-# We can see that these values contain the key we indicated as the in_key (and out_key)
-# as well as recurrent key names. The out_keys are preceded by a "next" prefix
-# that indicates that they will need to be written in the "next" TensorDict.
-# We use this convention (which can be overridden by passing the in_keys/out_keys
-# arguments) to make sure that a call to :func:`~torchrl.envs.utils.step_mdp` will
-# move the recurrent state to the root TensorDict, making it available to the
-# RNN during the following call (see figure in the intro).
-#
-# As mentioned earlier, we have one more optional transform to add to our
-# environment to make sure that the recurrent states are passed to the buffer.
-# The :meth:`~torchrl.modules.LSTMModule.make_tensordict_primer` method does
-# exactly that:
+# 이 값에는 in_key(및 out_key)로 지정한 키와 순환 키 이름이 포함되어
+# 있습니다. out_keys 앞에는 "next" 접두사가 붙어 있어 "next" TensorDict에
+# 기록해야 함을 나타냅니다.
+# 이 규칙(in_keys/out_keys 인자를 전달하여 재정의할 수 있음)을 사용하면
+# :func:`~torchrl.envs.utils.step_mdp` 호출 시 순환 상태가 루트 TensorDict로
+# 이동하여, 다음 호출에서 RNN이 이를 사용할 수 있게 됩니다(도입부의 그림 참조).
+#
+# 앞서 언급한 대로 순환 상태가 버퍼에 전달되도록 환경에 선택적 변환을
+# 하나 더 추가해야 합니다.
+# :meth:`~torchrl.modules.LSTMModule.make_tensordict_primer` 메소드가
+# 이를 정확히 수행합니다.
 #
 env.append_transform(lstm.make_tensordict_primer())
 
 ######################################################################
-# and that's it! We can print the environment to check that everything looks good now
-# that we have added the primer:
+# 프라이머를 추가한 후 환경을 출력하여 모든 것이 올바른지 확인합니다.
 print(env)
 
 ######################################################################
 # MLP
 # ~~~
 #
-# We use a single-layer MLP to represent the action values we'll be using for
-# our policy.
+# 정책에 사용할 행동 가치를 나타내기 위해 단일 계층 MLP를 사용합니다.
 #
 mlp = MLP(
     out_features=2,
@@ -281,43 +267,44 @@
     device=device,
 )
 ######################################################################
-# and fill the bias with zeros:
+# 편향(bias)을 0으로 채웁니다.
 
 mlp[-1].bias.data.fill_(0.0)
 mlp = Mod(mlp, in_keys=["embed"], out_keys=["action_value"])
 
 ######################################################################
-# Using the Q-Values to select an action
+# Q-값을 사용한 행동 선택
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# The last part of our policy is the Q-Value Module.
-# The Q-Value module :class:`~torchrl.modules.tensordict_module.QValueModule`
-# will read the ``"action_values"`` key that is produced by our MLP and
-# from it, gather the action that has the maximum value.
-# The only thing we need to do is to specify the action space, which can be done
-# either by passing a string or an action-spec. This allows us to use
-# Categorical (sometimes called "sparse") encoding or the one-hot version of it.
+# 정책의 마지막 부분은 Q-값 모듈입니다.
+# Q-값 모듈 :class:`~torchrl.modules.tensordict_module.QValueModule` 은
+# MLP가 생성한 ``"action_value"`` 키를 읽고, 가장 높은 값을 가진 행동을
+# 선택합니다.
+# 행동 공간(action space)만 지정하면 되며, 문자열 또는 action-spec을 전달하여
+# 지정할 수 있습니다. 이를 통해 범주형(Categorical, 때때로 "sparse"라고도 함)
+# 인코딩 또는 원-핫(One-Hot) 버전을 사용할 수 있습니다.
 #
 qval = QValueModule(spec=env.action_spec)
 
 ######################################################################
 # .. note::
-#   TorchRL also provides a wrapper class :class:`torchrl.modules.QValueActor` that
-#   wraps a module in a Sequential together with a :class:`~torchrl.modules.tensordict_module.QValueModule`
-#   like we are doing explicitly here. There is little advantage to do this
-#   and the process is less transparent, but the end results will be similar to
-#   what we do here.
+#   TorchRL은 래퍼 클래스 :class:`torchrl.modules.QValueActor` 도 제공합니다.
+#   이 클래스는 여기서 명시적으로 하는 것처럼 모듈을
+#   :class:`~torchrl.modules.tensordict_module.QValueModule` 과 함께 Sequential로 묶습니다.
+#   이렇게 하는 것에 큰 이점은 없고 과정이 덜 투명하지만, 최종 결과는
+#   여기서 수행하는 것과 유사합니다.
 #
-# We can now put things together in a :class:`~tensordict.nn.TensorDictSequential`
+# 이제 :class:`~tensordict.nn.TensorDictSequential` 로 구성 요소를 조합할 수
+# 있습니다.
 #
 stoch_policy = Seq(feature, lstm, mlp, qval)
 
 ######################################################################
-# DQN being a deterministic algorithm, exploration is a crucial part of it.
-# We'll be using an :math:`\epsilon`-greedy policy with an epsilon of 0.2 decaying
-# progressively to 0.
-# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyModule.step`
-# (see training loop below).
+# DQN은 결정적(deterministic) 알고리즘이므로 탐색(exploration)이 매우 중요합니다.
+# 초기값 0.2에서 점진적으로 0으로 감소하는 :math:`\epsilon`-탐욕(greedy) 정책을
+# 사용합니다.
+# 이 감소는 :meth:`~torchrl.modules.EGreedyModule.step` 호출을 통해
+# 이루어집니다(아래 학습 루프 참조).
 #
 exploration_module = EGreedyModule(
     annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2
@@ -328,66 +315,64 @@
 )
 
 ######################################################################
-# Using the model for the loss
+# 손실에 모델 사용하기
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# The model as we've built it is well equipped to be used in sequential settings.
-# However, the class :class:`torch.nn.LSTM` can use a cuDNN-optimized backend
-# to run the RNN sequence faster on GPU device. We would not want to miss
-# such an opportunity to speed up our training loop!
-# To use it, we just need to tell the LSTM module to run on "recurrent-mode"
-# when used by the loss.
-# As we'll usually want to have two copies of the LSTM module, we do this by
-# calling a :meth:`~torchrl.modules.LSTMModule.set_recurrent_mode` method that
-# will return a new instance of the LSTM (with shared weights) that will
-# assume that the input data is sequential in nature.
+# 지금까지 구성한 모델은 순차적 설정에서 사용하기에 적합합니다.
+# 그러나 :class:`torch.nn.LSTM` 클래스는 GPU 장치에서 RNN 시퀀스를 더 빠르게
+# 실행하기 위해 cuDNN 최적화 백엔드를 사용할 수 있습니다. 학습 루프를
+# 가속할 수 있는 기회를 놓치지 않겠습니다.
+# 이를 위해 손실에서 사용할 때 LSTM 모듈을 "recurrent-mode"로 실행하도록
+# 지정하면 됩니다.
+# 일반적으로 LSTM 모듈의 복사본이 2개 필요하므로,
+# :meth:`~torchrl.modules.LSTMModule.set_recurrent_mode` 메소드를 호출합니다. 이 메소드는
+# 입력 데이터가 순차적이라고 가정하는 새 LSTM 인스턴스(가중치 공유)를 반환합니다.
 #
 policy = Seq(feature, lstm.set_recurrent_mode(True), mlp, qval)
 
 ######################################################################
-# Because we still have a couple of uninitialized parameters we should
-# initialize them before creating an optimizer and such.
+# 아직 초기화되지 않은 매개변수(parameter)가 몇 개 있으므로
+# 옵티마이저(Optimizer) 등을 생성하기 전에 초기화해야 합니다.
 #
 policy(env.reset())
 
 ######################################################################
-# DQN Loss
+# DQN 손실
 # --------
 #
-# Out DQN loss requires us to pass the policy and, again, the action-space.
-# While this may seem redundant, it is important as we want to make sure that
-# the :class:`~torchrl.objectives.DQNLoss` and the :class:`~torchrl.modules.tensordict_module.QValueModule`
-# classes are compatible, but aren't strongly dependent on each other.
+# DQN 손실(loss)에는 정책과 행동 공간을 전달해야 합니다.
+# 이것이 중복으로 보일 수 있지만, :class:`~torchrl.objectives.DQNLoss` 와
+# :class:`~torchrl.modules.tensordict_module.QValueModule` 클래스가
+# 호환되면서도 서로 강하게 의존하지 않도록 하기 위해 중요합니다.
 #
-# To use the Double-DQN, we ask for a ``delay_value`` argument that will
-# create a non-differentiable copy of the network parameters to be used
-# as a target network.
+# Double-DQN을 사용하기 위해 ``delay_value`` 인자를 지정합니다. 이 인자는
+# 타겟 네트워크로 사용할, 미분되지 않는 네트워크 매개변수 복사본을 생성합니다.
 loss_fn = DQNLoss(policy, action_space=env.action_spec, delay_value=True)
 
 ######################################################################
-# Since we are using a double DQN, we need to update the target parameters.
-# We'll use a  :class:`~torchrl.objectives.SoftUpdate` instance to carry out
-# this work.
+# Double DQN을 사용하고 있으므로 타겟 매개변수를 갱신해야 합니다.
+# :class:`~torchrl.objectives.SoftUpdate` 인스턴스를 사용하여 이 작업을
+# 수행합니다.
 #
 updater = SoftUpdate(loss_fn, eps=0.95)
 
 optim = torch.optim.Adam(policy.parameters(), lr=3e-4)
 
 ######################################################################
-# Collector and replay buffer
+# 수집기와 리플레이 버퍼
 # ---------------------------
 #
-# We build the simplest data collector there is. We'll try to train our algorithm
-# with a million frames, extending the buffer with 50 frames at a time. The buffer
-# will be designed to store 20 thousands trajectories of 50 steps each.
-# At each optimization step (16 per data collection), we'll collect 4 items
-# from our buffer, for a total of 200 transitions.
-# We'll use a :class:`~torchrl.data.replay_buffers.LazyMemmapStorage` storage to keep the data
-# on disk.
+# 가장 단순한 데이터 수집기를 구성합니다. 총 백만 프레임으로 알고리즘을
+# 학습하며, 한 번에 50 프레임씩 버퍼를 확장합니다. 버퍼는 50 단계의
+# 궤적 2만 개를 저장하도록 설계됩니다.
+# 각 최적화 단계(데이터 수집당 16회)에서 버퍼로부터 4개 항목을 추출하여
+# 총 200개의 전이(transition)를 처리합니다.
+# 데이터를 디스크에 유지하기 위해 :class:`~torchrl.data.replay_buffers.LazyMemmapStorage`
+# 스토리지를 사용합니다.
 #
 # .. note::
-#   For the sake of efficiency, we're only running a few thousands iterations
-#   here. In a real setting, the total number of frames should be set to 1M.
+#   효율성을 위해 여기서는 수천 번의 반복만 실행합니다. 실제 학습에서는
+#   총 프레임 수를 100만으로 설정해야 합니다.
 #
 collector = SyncDataCollector(env, stoch_policy, frames_per_batch=50, total_frames=200, device=device)
 rb = TensorDictReplayBuffer(
@@ -395,11 +380,11 @@
 )
 
 ######################################################################
-# Training loop
+# 학습 루프
 # -------------
 #
-# To keep track of the progress, we will run the policy in the environment once
-# every 50 data collection, and plot the results after training.
+# 진행 상황을 추적하기 위해 50번의 데이터 수집마다 환경에서 정책을 한 번씩
+# 실행하고, 학습 후 결과를 그래프로 나타냅니다.
 #
 
 utd = 16
@@ -418,7 +403,7 @@
             data,
         )
     pbar.update(data.numel())
-    # it is important to pass data that is not flattened
+    # 평탄화되지 않은 데이터를 전달하는 것이 중요합니다
     rb.extend(data.unsqueeze(0).to_tensordict().cpu())
     for _ in range(utd):
         s = rb.sample().to(device, non_blocking=True)
@@ -438,7 +423,7 @@
         traj_lens.append(rollout.get(("next", "step_count")).max().item())
 
 ######################################################################
-# Let's plot our results:
+# 결과를 그래프로 나타냅니다.
 #
 if traj_lens:
     from matplotlib import pyplot as plt
@@ -448,21 +433,20 @@
     plt.title("Test trajectory lengths")
 
 ######################################################################
-# Conclusion
+# 결론
 # ----------
 #
-# We have seen how an RNN can be incorporated in a policy in TorchRL.
-# You should now be able:
+# TorchRL에서 정책에 RNN을 통합하는 방법을 살펴보았습니다.
+# 이제 다음을 수행할 수 있습니다.
 #
-# - Create an LSTM module that acts as a :class:`~tensordict.nn.TensorDictModule`
-# - Indicate to the LSTM module that a reset is needed via an :class:`~torchrl.envs.transforms.InitTracker`
-#   transform
-# - Incorporate this module in a policy and in a loss module
-# - Make sure that the collector is made aware of the recurrent state entries
-#   such that they can be stored in the replay buffer along with the rest of
-#   the data
+# - :class:`~tensordict.nn.TensorDictModule` 로 작동하는 LSTM 모듈 생성하기
+# - :class:`~torchrl.envs.transforms.InitTracker` 변환을 통해 LSTM 모듈에
+#   초기화가 필요함을 알리기
+# - 이 모듈을 정책과 손실 모듈에 통합하기
+# - 수집기가 순환 상태 항목을 인식하도록 하여 나머지 데이터와 함께
+#   리플레이 버퍼에 저장할 수 있도록 하기
 #
-# Further Reading
+# 추가 자료
 # ---------------
-# 
-# - The TorchRL documentation can be found `here <https://pytorch.org/rl/>`_.
+#
+# - TorchRL 문서는 `여기 <https://pytorch.org/rl/>`_ 에서 확인할 수 있습니다.
\ No newline at end of file