diff --git a/beginner_source/blitz/neural_networks_tutorial.py b/beginner_source/blitz/neural_networks_tutorial.py
index 23f50526e2..ae29411bec 100644
--- a/beginner_source/blitz/neural_networks_tutorial.py
+++ b/beginner_source/blitz/neural_networks_tutorial.py
@@ -45,13 +45,13 @@ class Net(nn.Module):
 
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         # 1 input image channel, 6 output channels, 5x5 square convolution
         # kernel
         self.conv1 = nn.Conv2d(1, 6, 5)
         self.conv2 = nn.Conv2d(6, 16, 5)
         # an affine operation: y = Wx + b
-        self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
         self.fc2 = nn.Linear(120, 84)
         self.fc3 = nn.Linear(84, 10)
@@ -205,7 +205,9 @@ def forward(self, input):
 #
 #
 # Now we shall call ``loss.backward()``, and have a look at conv1's bias
-# gradients before and after the backward.
+# gradients before and after the backward. Since we have not introduced an
+# optimizer yet, we clear the gradients directly on the model. Once you are
+# using an optimizer, prefer ``optimizer.zero_grad()`` as shown below.
 
 net.zero_grad()     # zeroes the gradient buffers of all parameters
@@ -246,7 +248,8 @@ def forward(self, input):
 #
 #     learning_rate = 0.01
 #     for f in net.parameters():
-#         f.data.sub_(f.grad.data * learning_rate)
+#         with torch.no_grad():
+#             f -= f.grad * learning_rate
 #
 # However, as you use neural networks, you want to use various different
 # update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.
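
For reference, a minimal runnable sketch of the two update styles this patch touches, side by side. The ``nn.Linear`` stand-in model, the random ``input``/``target`` tensors, and ``learning_rate`` are illustrative assumptions, not part of the tutorial; the tutorial's ``Net`` would work the same way:

    import torch
    import torch.nn as nn
    import torch.optim as optim

    # Stand-in model and data (illustrative only).
    model = nn.Linear(4, 2)
    input = torch.randn(1, 4)
    target = torch.randn(1, 2)
    criterion = nn.MSELoss()
    learning_rate = 0.01

    # Manual SGD step, as in the patched tutorial text: mutate parameters
    # in-place under torch.no_grad() so autograd does not record the update.
    model.zero_grad()                       # clear stale gradient buffers
    loss = criterion(model(input), target)
    loss.backward()
    with torch.no_grad():
        for f in model.parameters():
            f -= f.grad * learning_rate

    # Equivalent step via torch.optim; prefer optimizer.zero_grad() once an
    # optimizer owns the parameters.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    optimizer.zero_grad()
    loss = criterion(model(input), target)
    loss.backward()
    optimizer.step()                        # applies the same vanilla SGD rule

Updating parameters under ``torch.no_grad()`` keeps the update itself out of the autograd graph, which is why the patch replaces the discouraged ``.data`` idiom.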