diff --git a/beginner_source/introyt/autogradyt_tutorial.py b/beginner_source/introyt/autogradyt_tutorial.py
index abf75a7d266..6320af08672 100644
--- a/beginner_source/introyt/autogradyt_tutorial.py
+++ b/beginner_source/introyt/autogradyt_tutorial.py
@@ -119,7 +119,6 @@
 import matplotlib.ticker as ticker
 import math
 
-
 #########################################################################
 # Next, we’ll create an input tensor full of evenly spaced values on the
 # interval :math:`[0, 2{\pi}]`, and specify ``requires_grad=True``. (Like
@@ -127,16 +126,16 @@
 # optional ``requires_grad`` option.) Setting this flag means that in
 # every computation that follows, autograd will be accumulating the
 # history of the computation in the output tensors of that computation.
-# 
+#
 
-a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+a = torch.linspace(0.0, 2.0 * math.pi, steps=25, requires_grad=True)
 print(a)
 
 
 ########################################################################
 # Next, we’ll perform a computation, and plot its output in terms of its
 # inputs:
-# 
+#
 
 b = torch.sin(a)
 plt.plot(a.detach(), b.detach())
@@ -145,7 +144,7 @@
 ########################################################################
 # Let’s have a closer look at the tensor ``b``. When we print it, we see
 # an indicator that it is tracking its computation history:
-# 
+#
 
 print(b)
 
@@ -154,9 +153,9 @@
 # This ``grad_fn`` gives us a hint that when we execute the
 # backpropagation step and compute gradients, we’ll need to compute the
 # derivative of :math:`\sin(x)` for all this tensor’s inputs.
-# 
+#
 # Let’s perform some more computations:
-# 
+#
 
 c = 2 * b
 print(c)
@@ -170,7 +169,7 @@
 # ``.backward()`` on a tensor with no arguments, it expects the calling
 # tensor to contain only a single element, as is the case when computing a
 # loss function.
-# 
+#
 
 out = d.sum()
 print(out)
@@ -183,19 +182,24 @@
 # shows us the gradient functions for all the prior tensors. Note that
 # ``a.grad_fn`` is reported as ``None``, indicating that this was an input
 # to the function with no history of its own.
-# 
+#
 
-print('d:')
+print("d:")
 print(d.grad_fn)
 print(d.grad_fn.next_functions)
 print(d.grad_fn.next_functions[0][0].next_functions)
 print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
-print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions)
-print('\nc:')
+print(
+    d.grad_fn.next_functions[0][0]
+    .next_functions[0][0]
+    .next_functions[0][0]
+    .next_functions
+)
+print("\nc:")
 print(c.grad_fn)
-print('\nb:')
+print("\nb:")
 print(b.grad_fn)
-print('\na:')
+print("\na:")
 print(a.grad_fn)
 
 
@@ -203,7 +207,7 @@
 # With all this machinery in place, how do we get derivatives out? You
 # call the ``backward()`` method on the output, and check the input’s
 # ``grad`` property to inspect the gradients:
-# 
+#
 
 out.backward()
 print(a.grad)
@@ -212,54 +216,56 @@
 
 #########################################################################
 # Recall the computation steps we took to get here:
-# 
+#
 # .. code-block:: python
-# 
+#
 #    a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
 #    b = torch.sin(a)
 #    c = 2 * b
 #    d = c + 1
 #    out = d.sum()
-# 
+#
 # Adding a constant, as we did to compute ``d``, does not change the
 # derivative. That leaves :math:`c = 2 * b = 2 * \sin(a)`, the derivative
 # of which should be :math:`2 * \cos(a)`. Looking at the graph above,
 # that’s just what we see.
-# 
+#
 # Be aware that only *leaf nodes* of the computation have their gradients
 # computed. If you tried, for example, ``print(c.grad)`` you’d get back
 # ``None``. In this simple example, only the input is a leaf node, so only
 # it has gradients computed.
-# 
+#
 # Autograd in Training
 # --------------------
-# 
+#
 # We’ve had a brief look at how autograd works, but how does it look when
 # it’s used for its intended purpose? Let’s define a small model and
 # examine how it changes after a single training batch. First, define a
 # few constants, our model, and some stand-ins for inputs and outputs:
-# 
+#
 
 BATCH_SIZE = 16
 DIM_IN = 1000
 HIDDEN_SIZE = 100
 DIM_OUT = 10
 
+
 class TinyModel(torch.nn.Module):
 
     def __init__(self):
         super(TinyModel, self).__init__()
-        
+
         self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE)
         self.relu = torch.nn.ReLU()
         self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT)
-    
+
     def forward(self, x):
         x = self.layer1(x)
         x = self.relu(x)
         x = self.layer2(x)
         return x
-    
+
+
 some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
 ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
 
@@ -271,12 +277,12 @@ def forward(self, x):
 # ``requires_grad=True`` for the model’s layers. Within a subclass of
 # ``torch.nn.Module``, it’s assumed that we want to track gradients on the
 # layers’ weights for learning.
-# 
+#
 # If we look at the layers of the model, we can examine the values of the
 # weights, and verify that no gradients have been computed yet:
-# 
+#
 
-print(model.layer2.weight[0][0:10]) # just a small slice
+print(model.layer2.weight[0][0:10])  # just a small slice
 print(model.layer2.weight.grad)
 
 
@@ -285,7 +291,7 @@ def forward(self, x):
 # loss function, we’ll just use the square of the Euclidean distance
 # between our ``prediction`` and the ``ideal_output``, and we’ll use a
 # basic stochastic gradient descent optimizer.
-# 
+#
 
 optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
 
@@ -297,7 +303,7 @@ def forward(self, x):
 
 ######################################################################
 # Now, let’s call ``loss.backward()`` and see what happens:
-# 
+#
 
 loss.backward()
 print(model.layer2.weight[0][0:10])
@@ -309,7 +315,7 @@ def forward(self, x):
 # weight, but the weights remain unchanged, because we haven’t run the
 # optimizer yet. The optimizer is responsible for updating model weights
 # based on the computed gradients.
-# 
+#
 
 optimizer.step()
 print(model.layer2.weight[0][0:10])
@@ -318,12 +324,12 @@ def forward(self, x):
 
 ######################################################################
 # You should see that ``layer2``\ ’s weights have changed.
-# 
+#
 # One important thing about the process: After calling
 # ``optimizer.step()``, you need to call ``optimizer.zero_grad()``, or
 # else every time you run ``loss.backward()``, the gradients on the
 # learning weights will accumulate:
-# 
+#
 
 print(model.layer2.weight.grad[0][0:10])
 
@@ -331,7 +337,7 @@ def forward(self, x):
     prediction = model(some_input)
     loss = (ideal_output - prediction).pow(2).sum()
     loss.backward()
-    
+
 print(model.layer2.weight.grad[0][0:10])
 
 optimizer.zero_grad(set_to_none=False)
@@ -345,17 +351,17 @@ def forward(self, x):
 # gradients will be much larger. Failing to zero the gradients before
 # running your next training batch will cause the gradients to blow up in
 # this manner, causing incorrect and unpredictable learning results.
-# 
+#
 # Turning Autograd Off and On
 # ---------------------------
-# 
+#
 # There are situations where you will need fine-grained control over
 # whether autograd is enabled. There are multiple ways to do this,
 # depending on the situation.
-# 
+#
 # The simplest is to change the ``requires_grad`` flag on a tensor
 # directly:
-# 
+#
 
 a = torch.ones(2, 3, requires_grad=True)
 print(a)
@@ -374,10 +380,10 @@ def forward(self, x):
 # a tensor, ``a``, that had autograd turned on. When we turn off autograd
 # explicitly with ``a.requires_grad = False``, computation history is no
 # longer tracked, as we see when we compute ``b2``.
-# 
+#
 # If you only need autograd turned off temporarily, a better way is to use
 # the ``torch.no_grad()``:
-# 
+#
 
 a = torch.ones(2, 3, requires_grad=True) * 2
 b = torch.ones(2, 3, requires_grad=True) * 3
@@ -396,11 +402,13 @@ def forward(self, x):
 
 ##########################################################################
 # ``torch.no_grad()`` can also be used as a function or method decorator:
-# 
+#
+
 
 def add_tensors1(x, y):
     return x + y
 
+
 @torch.no_grad()
 def add_tensors2(x, y):
     return x + y
@@ -420,12 +428,12 @@ def add_tensors2(x, y):
 # There’s a corresponding context manager, ``torch.enable_grad()``, for
 # turning autograd on when it isn’t already. It may also be used as a
 # decorator.
-# 
+#
 # Finally, you may have a tensor that requires gradient tracking, but you
 # want a copy that does not. For this we have the ``Tensor`` object’s
 # ``detach()`` method - it creates a copy of the tensor that is *detached*
 # from the computation history:
-# 
+#
 
 x = torch.rand(5, requires_grad=True)
 y = x.detach()
@@ -440,10 +448,10 @@ def add_tensors2(x, y):
 # conversion from a PyTorch tensor to a NumPy array is not enabled for
 # tensors with requires_grad=True. Making a detached copy lets us move
 # forward.
-# 
+#
 # Autograd and In-place Operations
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # In every example in this notebook so far, we’ve used variables to
 # capture the intermediate values of a computation. Autograd needs these
 # intermediate values to perform gradient computations. *For this reason,
@@ -452,32 +460,32 @@ def add_tensors2(x, y):
 # derivatives in the ``backward()`` call. PyTorch will even stop you if
 # you attempt an in-place operation on leaf variable that requires
 # autograd, as shown below.
-# 
+#
 # .. note::
 #     The following code cell throws a runtime error. This is expected.
-# 
+#
 #    .. code-block:: python
 #
 #       a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
-#       torch.sin_(a)
+#       a.sin_()
 #
 
 #########################################################################
 # Autograd Profiler
 # -----------------
-# 
+#
 # Autograd tracks every step of your computation in detail. Such a
 # computation history, combined with timing information, would make a
 # handy profiler - and autograd has that feature baked in. Here’s a quick
 # example usage:
-# 
+#
 
-device = torch.device('cpu')
+device = torch.device("cpu")
 run_on_gpu = False
 if torch.cuda.is_available():
-    device = torch.device('cuda')
+    device = torch.device("cuda")
     run_on_gpu = True
-    
+
 x = torch.randn(2, 3, requires_grad=True)
 y = torch.rand(2, 3, requires_grad=True)
 z = torch.ones(2, 3, requires_grad=True)
@@ -485,8 +493,8 @@ def add_tensors2(x, y):
 with torch.autograd.profiler.profile(use_cuda=run_on_gpu) as prf:
     for _ in range(1000):
         z = (z / x) * y
-        
-print(prf.key_averages().table(sort_by='self_cpu_time_total'))
+
+print(prf.key_averages().table(sort_by="self_cpu_time_total"))
 
 
 ##########################################################################
@@ -494,15 +502,15 @@ def add_tensors2(x, y):
 # data by input tensor shape, and export data as a Chrome tracing tools
 # file. For full details of the API, see the
 # `documentation <https://pytorch.org/docs/stable/autograd.html#profiler>`__.
-# 
+#
 # Advanced Topic: More Autograd Detail and the High-Level API
 # -----------------------------------------------------------
-# 
+#
 # If you have a function with an n-dimensional input and m-dimensional
 # output, :math:`\vec{y}=f(\vec{x})`, the complete gradient is a matrix of
 # the derivative of every output with respect to every input, called the
 # *Jacobian:*
-# 
+#
 # .. math::
 #
 #      J
@@ -512,22 +520,22 @@ def add_tensors2(x, y):
 #      \vdots & \ddots & \vdots\\
 #      \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}}
 #      \end{array}\right)
-# 
+#
 # If you have a second function, :math:`l=g\left(\vec{y}\right)` that
 # takes m-dimensional input (that is, the same dimensionality as the
 # output above), and returns a scalar output, you can express its
 # gradients with respect to :math:`\vec{y}` as a column vector,
 # :math:`v=\left(\begin{array}{ccc}\frac{\partial l}{\partial y_{1}} & \cdots & \frac{\partial l}{\partial y_{m}}\end{array}\right)^{T}`
 # - which is really just a one-column Jacobian.
-# 
+#
 # More concretely, imagine the first function as your PyTorch model (with
 # potentially many inputs and many outputs) and the second function as a
 # loss function (with the model’s output as input, and the loss value as
 # the scalar output).
-# 
+#
 # If we multiply the first function’s Jacobian by the gradient of the
 # second function, and apply the chain rule, we get:
-# 
+#
 # .. math::
 #
 #    J^{T}\cdot v=\left(\begin{array}{ccc}
@@ -543,24 +551,24 @@ def add_tensors2(x, y):
 #    \vdots\\
 #    \frac{\partial l}{\partial x_{n}}
 #    \end{array}\right)
-# 
+#
 # Note: You could also use the equivalent operation :math:`v^{T}\cdot J`,
 # and get back a row vector.
-# 
+#
 # The resulting column vector is the *gradient of the second function with
 # respect to the inputs of the first* - or in the case of our model and
 # loss function, the gradient of the loss with respect to the model
 # inputs.
-# 
+#
 # **``torch.autograd`` is an engine for computing these products.** This
 # is how we accumulate the gradients over the learning weights during the
 # backward pass.
-# 
+#
 # For this reason, the ``backward()`` call can *also* take an optional
 # vector input. This vector represents a set of gradients over the tensor,
 # which are multiplied by the Jacobian of the autograd-traced tensor that
 # precedes it. Let’s try a specific example with a small vector:
-# 
+#
 
 x = torch.randn(3, requires_grad=True)
 
@@ -577,9 +585,9 @@ def add_tensors2(x, y):
 # outputs. For a multi-dimensional output, autograd expects us to provide
 # gradients for those three outputs that it can multiply into the
 # Jacobian:
-# 
+#
 
-v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) # stand-in for gradients
+v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)  # stand-in for gradients
 y.backward(v)
 
 print(x.grad)
@@ -588,25 +596,27 @@ def add_tensors2(x, y):
 ##########################################################################
 # (Note that the output gradients are all related to powers of two - which
 # we’d expect from a repeated doubling operation.)
-# 
+#
 # The High-Level API
 # ~~~~~~~~~~~~~~~~~~
-# 
+#
 # There is an API on autograd that gives you direct access to important
 # differential matrix and vector operations. In particular, it allows you
 # to calculate the Jacobian and the *Hessian* matrices of a particular
 # function for particular inputs. (The Hessian is like the Jacobian, but
 # expresses all partial *second* derivatives.) It also provides methods
 # for taking vector products with these matrices.
-# 
+#
 # Let’s take the Jacobian of a simple function, evaluated for a 2
 # single-element inputs:
-# 
+#
+
 
 def exp_adder(x, y):
     return 2 * x.exp() + 3 * y
 
-inputs = (torch.rand(1), torch.rand(1)) # arguments for the function
+
+inputs = (torch.rand(1), torch.rand(1))  # arguments for the function
 print(inputs)
 torch.autograd.functional.jacobian(exp_adder, inputs)
 
@@ -615,11 +625,11 @@ def exp_adder(x, y):
 # If you look closely, the first output should equal :math:`2e^x` (since
 # the derivative of :math:`e^x` is :math:`e^x`), and the second value
 # should be 3.
-# 
+#
 # You can, of course, do this with higher-order tensors:
-# 
+#
 
-inputs = (torch.rand(3), torch.rand(3)) # arguments for the function
+inputs = (torch.rand(3), torch.rand(3))  # arguments for the function
 print(inputs)
 torch.autograd.functional.jacobian(exp_adder, inputs)
 
@@ -628,10 +638,11 @@ def exp_adder(x, y):
 # The ``torch.autograd.functional.hessian()`` method works identically
 # (assuming your function is twice differentiable), but returns a matrix
 # of all second derivatives.
-# 
+#
 # There is also a function to directly compute the vector-Jacobian
 # product, if you provide the vector:
-# 
+#
+
 
 def do_some_doubling(x):
     y = x * 2
@@ -639,6 +650,7 @@ def do_some_doubling(x):
         y = y * 2
     return y
 
+
 inputs = torch.randn(3)
 my_gradients = torch.tensor([0.1, 1.0, 0.0001])
 torch.autograd.functional.vjp(do_some_doubling, inputs, v=my_gradients)
@@ -648,8 +660,8 @@ def do_some_doubling(x):
 # The ``torch.autograd.functional.jvp()`` method performs the same matrix
 # multiplication as ``vjp()`` with the operands reversed. The ``vhp()``
 # and ``hvp()`` methods do the same for a vector-Hessian product.
-# 
+#
 # For more information, including performance notes on the `docs for the
 # functional
 # API <https://pytorch.org/docs/stable/autograd.html#functional-higher-level-api>`__
-# 
+#
diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py
index 4d118ad4030..0f20733148a 100644
--- a/beginner_source/introyt/tensors_deeper_tutorial.py
+++ b/beginner_source/introyt/tensors_deeper_tutorial.py
@@ -30,13 +30,12 @@
 import torch
 import math
 
-
 #########################################################################
 # Creating Tensors
 # ----------------
-# 
+#
 # The simplest way to create a tensor is with the ``torch.empty()`` call:
-# 
+#
 
 x = torch.empty(3, 4)
 print(type(x))
@@ -45,7 +44,7 @@
 
 ##########################################################################
 # Let’s upack what we just did:
-# 
+#
 # -  We created a tensor using one of the numerous factory methods
 #    attached to the ``torch`` module.
 # -  The tensor itself is 2-dimensional, having 3 rows and 4 columns.
@@ -57,21 +56,21 @@
 #    tensor. The ``torch.empty()`` call allocates memory for the tensor,
 #    but does not initialize it with any values - so what you’re seeing is
 #    whatever was in memory at the time of allocation.
-# 
+#
 # A brief note about tensors and their number of dimensions, and
 # terminology:
-# 
+#
 # -  You will sometimes see a 1-dimensional tensor called a
-#    *vector.* 
+#    *vector.*
 # -  Likewise, a 2-dimensional tensor is often referred to as a
-#    *matrix.* 
+#    *matrix.*
 # -  Anything with more than two dimensions is generally just
 #    called a tensor.
-# 
+#
 # More often than not, you’ll want to initialize your tensor with some
 # value. Common cases are all zeros, all ones, or random values, and the
 # ``torch`` module provides factory methods for all of these:
-# 
+#
 
 zeros = torch.zeros(2, 3)
 print(zeros)
@@ -88,10 +87,10 @@
 # The factory methods all do just what you’d expect - we have a tensor
 # full of zeros, another full of ones, and another with random values
 # between 0 and 1.
-# 
+#
 # Random Tensors and Seeding
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # Speaking of the random tensor, did you notice the call to
 # ``torch.manual_seed()`` immediately preceding it? Initializing tensors,
 # such as a model’s learning weights, with random values is common but
@@ -99,7 +98,7 @@
 # some assurance of the reproducibility of your results. Manually setting
 # your random number generator’s seed is the way to do this. Let’s look
 # more closely:
-# 
+#
 
 torch.manual_seed(1729)
 random1 = torch.rand(2, 3)
@@ -121,18 +120,18 @@
 # identical values, as do ``random2`` and ``random4``. Manually setting
 # the RNG’s seed resets it, so that identical computations depending on
 # random number should, in most settings, provide identical results.
-# 
+#
 # For more information, see the `PyTorch documentation on
 # reproducibility <https://pytorch.org/docs/stable/notes/randomness.html>`__.
-# 
+#
 # Tensor Shapes
 # ~~~~~~~~~~~~~
-# 
+#
 # Often, when you’re performing operations on two or more tensors, they
 # will need to be of the same *shape* - that is, having the same number of
 # dimensions and the same number of cells in each dimension. For that, we
 # have the ``torch.*_like()`` methods:
-# 
+#
 
 x = torch.empty(2, 2, 3)
 print(x.shape)
@@ -160,15 +159,15 @@
 # property on a tensor. This property contains a list of the extent of
 # each dimension of a tensor - in our case, ``x`` is a three-dimensional
 # tensor with shape 2 x 2 x 3.
-# 
+#
 # Below that, we call the ``.empty_like()``, ``.zeros_like()``,
 # ``.ones_like()``, and ``.rand_like()`` methods. Using the ``.shape``
 # property, we can verify that each of these methods returns a tensor of
 # identical dimensionality and extent.
-# 
+#
 # The last way to create a tensor that will cover is to specify its data
 # directly from a PyTorch collection:
-# 
+#
 
 some_constants = torch.tensor([[3.1415926, 2.71828], [1.61803, 0.0072897]])
 print(some_constants)
@@ -185,20 +184,20 @@
 # tensor if you already have data in a Python tuple or list. As shown
 # above, nesting the collections will result in a multi-dimensional
 # tensor.
-# 
+#
 # .. note::
 #      ``torch.tensor()`` creates a copy of the data.
-# 
+#
 # Tensor Data Types
 # ~~~~~~~~~~~~~~~~~
-# 
+#
 # Setting the datatype of a tensor is possible a couple of ways:
-# 
+#
 
 a = torch.ones((2, 3), dtype=torch.int16)
 print(a)
 
-b = torch.rand((2, 3), dtype=torch.float64) * 20.
+b = torch.rand((2, 3), dtype=torch.float64) * 20.0
 print(b)
 
 c = b.to(torch.int32)
@@ -211,40 +210,40 @@
 # we set ``dtype=torch.int16`` for the tensor ``a``. When we print ``a``,
 # we can see that it’s full of ``1`` rather than ``1.`` - Python’s subtle
 # cue that this is an integer type rather than floating point.
-# 
+#
 # Another thing to notice about printing ``a`` is that, unlike when we
 # left ``dtype`` as the default (32-bit floating point), printing the
 # tensor also specifies its ``dtype``.
-# 
+#
 # You may have also spotted that we went from specifying the tensor’s
 # shape as a series of integer arguments, to grouping those arguments in a
 # tuple. This is not strictly necessary - PyTorch will take a series of
 # initial, unlabeled integer arguments as a tensor shape - but when adding
 # the optional arguments, it can make your intent more readable.
-# 
+#
 # The other way to set the datatype is with the ``.to()`` method. In the
 # cell above, we create a random floating point tensor ``b`` in the usual
 # way. Following that, we create ``c`` by converting ``b`` to a 32-bit
 # integer with the ``.to()`` method. Note that ``c`` contains all the same
 # values as ``b``, but truncated to integers.
-# 
+#
 # For more information, see the `data types documentation <https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype>`__.
-# 
+#
 # Math & Logic with PyTorch Tensors
 # ---------------------------------
-# 
+#
 # Now that you know some of the ways to create a tensor… what can you do
 # with them?
-# 
+#
 # Let’s look at basic arithmetic first, and how tensors interact with
 # simple scalars:
-# 
+#
 
 ones = torch.zeros(2, 2) + 1
 twos = torch.ones(2, 2) * 2
 threes = (torch.ones(2, 2) * 7 - 1) / 2
-fours = twos ** 2
-sqrt2s = twos ** 0.5
+fours = twos**2
+sqrt2s = twos**0.5
 
 print(ones)
 print(twos)
@@ -260,10 +259,10 @@
 # the output of such an operation will be a tensor, you can chain them
 # together with the usual operator precedence rules, as in the line where
 # we create ``threes``.
-# 
+#
 # Similar operations between two tensors also behave like you’d
 # intuitively expect:
-# 
+#
 
 powers2 = twos ** torch.tensor([[1, 2], [3, 4]])
 print(powers2)
@@ -279,7 +278,7 @@
 # It’s important to note here that all of the tensors in the previous code
 # cell were of identical shape. What happens when we try to perform a
 # binary operation on tensors if dissimilar shape?
-# 
+#
 # .. note::
 #    The following cell throws a run-time error. This is intentional.
 #
@@ -296,17 +295,17 @@
 # In the general case, you cannot operate on tensors of different shape
 # this way, even in a case like the cell above, where the tensors have an
 # identical number of elements.
-# 
+#
 # In Brief: Tensor Broadcasting
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # .. note::
 #      If you are familiar with broadcasting semantics in NumPy
 #      ndarrays, you’ll find the same rules apply here.
-# 
+#
 # The exception to the same-shapes rule is *tensor broadcasting.* Here’s
 # an example:
-# 
+#
 
 rand = torch.rand(2, 4)
 doubled = rand * (torch.ones(1, 4) * 2)
@@ -318,66 +317,66 @@
 #########################################################################
 # What’s the trick here? How is it we got to multiply a 2x4 tensor by a
 # 1x4 tensor?
-# 
+#
 # Broadcasting is a way to perform an operation between tensors that have
 # similarities in their shapes. In the example above, the one-row,
 # four-column tensor is multiplied by *both rows* of the two-row,
 # four-column tensor.
-# 
+#
 # This is an important operation in Deep Learning. The common example is
 # multiplying a tensor of learning weights by a *batch* of input tensors,
 # applying the operation to each instance in the batch separately, and
 # returning a tensor of identical shape - just like our (2, 4) \* (1, 4)
 # example above returned a tensor of shape (2, 4).
-# 
+#
 # The rules for broadcasting are:
-# 
+#
 # -  Each tensor must have at least one dimension - no empty tensors.
-# 
+#
 # -  Comparing the dimension sizes of the two tensors, *going from last to
 #    first:*
-# 
+#
 #    -  Each dimension must be equal, *or*
-# 
+#
 #    -  One of the dimensions must be of size 1, *or*
-# 
+#
 #    -  The dimension does not exist in one of the tensors
-# 
+#
 # Tensors of identical shape, of course, are trivially “broadcastable”, as
 # you saw earlier.
-# 
+#
 # Here are some examples of situations that honor the above rules and
 # allow broadcasting:
-# 
+#
 
-a =     torch.ones(4, 3, 2)
+a = torch.ones(4, 3, 2)
 
-b = a * torch.rand(   3, 2) # 3rd & 2nd dims identical to a, dim 1 absent
+b = a * torch.rand(3, 2)  # 3rd & 2nd dims identical to a, dim 1 absent
 print(b)
 
-c = a * torch.rand(   3, 1) # 3rd dim = 1, 2nd dim identical to a
+c = a * torch.rand(3, 1)  # 3rd dim = 1, 2nd dim identical to a
 print(c)
 
-d = a * torch.rand(   1, 2) # 3rd dim identical to a, 2nd dim = 1
+d = a * torch.rand(1, 2)  # 3rd dim identical to a, 2nd dim = 1
 print(d)
 
 
 #############################################################################
-# Look closely at the values of each tensor above: 
+# Look closely at the values of each tensor above:
 #
-# -  The multiplication operation that created ``b`` was 
+# -  The multiplication operation that created ``b`` was
 #    broadcast over every “layer” of ``a``.
 # -  For ``c``, the operation was broadcast over every layer and row of
-#    ``a`` - every 3-element column is identical. 
+#    ``a`` - every 3-element column is identical.
 # -  For ``d``, we switched it around - now every *row* is identical,
 #    across layers and columns.
-# 
+#
 # For more information on broadcasting, see the `PyTorch
 # documentation <https://pytorch.org/docs/stable/notes/broadcasting.html>`__
 # on the topic.
-# 
+#
 # Here are some examples of attempts at broadcasting that will fail:
-# 
+#
 # .. note::
 #    The following cell throws a run-time error. This is intentional.
 #
@@ -396,16 +395,16 @@
 ###########################################################################
 # More Math with Tensors
 # ~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # PyTorch tensors have over three hundred operations that can be performed
 # on them.
-# 
+#
 # Here is a small sample from some of the major categories of operations:
-# 
+#
 
 # common functions
 a = torch.rand(2, 4) * 2 - 1
-print('Common functions:')
+print("Common functions:")
 print(torch.abs(a))
 print(torch.ceil(a))
 print(torch.floor(a))
@@ -415,44 +414,44 @@
 angles = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4])
 sines = torch.sin(angles)
 inverses = torch.asin(sines)
-print('\nSine and arcsine:')
+print("\nSine and arcsine:")
 print(angles)
 print(sines)
 print(inverses)
 
 # bitwise operations
-print('\nBitwise XOR:')
+print("\nBitwise XOR:")
 b = torch.tensor([1, 5, 11])
 c = torch.tensor([2, 7, 10])
 print(torch.bitwise_xor(b, c))
 
 # comparisons:
-print('\nBroadcasted, element-wise equality comparison:')
-d = torch.tensor([[1., 2.], [3., 4.]])
+print("\nBroadcasted, element-wise equality comparison:")
+d = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
 e = torch.ones(1, 2)  # many comparison ops support broadcasting!
-print(torch.eq(d, e)) # returns a tensor of type bool
+print(torch.eq(d, e))  # returns a tensor of type bool
 
 # reductions:
-print('\nReduction ops:')
-print(torch.max(d))        # returns a single-element tensor
-print(torch.max(d).item()) # extracts the value from the returned tensor
-print(torch.mean(d))       # average
-print(torch.std(d))        # standard deviation
-print(torch.prod(d))       # product of all numbers
-print(torch.unique(torch.tensor([1, 2, 1, 2, 1, 2]))) # filter unique elements
+print("\nReduction ops:")
+print(torch.max(d))  # returns a single-element tensor
+print(torch.max(d).item())  # extracts the value from the returned tensor
+print(torch.mean(d))  # average
+print(torch.std(d))  # standard deviation
+print(torch.prod(d))  # product of all numbers
+print(torch.unique(torch.tensor([1, 2, 1, 2, 1, 2])))  # filter unique elements
 
 # vector and linear algebra operations
-v1 = torch.tensor([1., 0., 0.])         # x unit vector
-v2 = torch.tensor([0., 1., 0.])         # y unit vector
-m1 = torch.rand(2, 2)                   # random matrix
-m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix
+v1 = torch.tensor([1.0, 0.0, 0.0])  # x unit vector
+v2 = torch.tensor([0.0, 1.0, 0.0])  # y unit vector
+m1 = torch.rand(2, 2)  # random matrix
+m2 = torch.tensor([[3.0, 0.0], [0.0, 3.0]])  # three times identity matrix
 
-print('\nVectors & Matrices:')
-print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
+print("\nVectors & Matrices:")
+print(torch.linalg.cross(v2, v1))  # negative of z unit vector (v1 x v2 == -v2 x v1)
 print(m1)
-m3 = torch.linalg.matmul(m1, m2)
-print(m3)                  # 3 times m1
-print(torch.linalg.svd(m3))       # singular value decomposition
+m3 = torch.matmul(m1, m2)
+print(m3)  # 3 times m1
+print(torch.linalg.svd(m3))  # singular value decomposition
 
 
 ##################################################################################
@@ -461,69 +460,70 @@
 # `documentation <https://pytorch.org/docs/stable/torch.html#math-operations>`__.
 # For more details and the full inventory of linear algebra operations, have a
 # look at this `documentation <https://pytorch.org/docs/stable/linalg.html>`__.
-# 
+#
 # Altering Tensors in Place
 # ~~~~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # Most binary operations on tensors will return a third, new tensor. When
 # we say ``c = a * b`` (where ``a`` and ``b`` are tensors), the new tensor
 # ``c`` will occupy a region of memory distinct from the other tensors.
-# 
+#
 # There are times, though, that you may wish to alter a tensor in place -
 # for example, if you’re doing an element-wise computation where you can
 # discard intermediate values. For this, most of the math functions have a
 # version with an appended underscore (``_``) that will alter a tensor in
 # place.
-# 
+#
 # For example:
-# 
+#
 
 a = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4])
-print('a:')
+print("a:")
 print(a)
-print(torch.sin(a))   # this operation creates a new tensor in memory
-print(a)              # a has not changed
+print(torch.sin(a))  # this operation creates a new tensor in memory
+print(a)  # a has not changed
 
 b = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4])
-print('\nb:')
+print("\nb:")
 print(b)
-print(torch.sin_(b))  # note the underscore
-print(b)              # b has changed
+print(b.sin_())  # note the underscore
+print(b)  # b has changed
 
 
 #######################################################################
 # For arithmetic operations, there are functions that behave similarly:
-# 
+#
 
 a = torch.ones(2, 2)
 b = torch.rand(2, 2)
 
-print('Before:')
+print("Before:")
 print(a)
 print(b)
-print('\nAfter adding:')
+print("\nAfter adding:")
 print(a.add_(b))
 print(a)
 print(b)
-print('\nAfter multiplying')
+print("\nAfter multiplying")
 print(b.mul_(b))
 print(b)
 
 
 ##########################################################################
 # Note that these in-place arithmetic functions are methods on the
-# ``torch.Tensor`` object, not attached to the ``torch`` module like many
-# other functions (e.g., ``torch.sin()``). As you can see from
-# ``a.add_(b)``, *the calling tensor is the one that gets changed in
-# place.*
-# 
+# ``torch.Tensor`` object. As you can see from ``a.add_(b)``, *the calling
+# tensor is the one that gets changed in place.* The in-place trigonometric
+# functions are called the same way: ``b.sin_()`` in the earlier example is a
+# method call on the tensor ``b``, not a call to a function on the ``torch``
+# module.
+#
 # There is another option for placing the result of a computation in an
 # existing, allocated tensor. Many of the methods and functions we’ve seen
 # so far - including creation methods! - have an ``out`` argument that
 # lets you specify a tensor to receive the output. If the ``out`` tensor
 # is the correct shape and ``dtype``, this can happen without a new memory
 # allocation:
-# 
+#
 
 a = torch.rand(2, 2)
 b = torch.rand(2, 2)
@@ -532,44 +532,44 @@
 
 print(c)
 d = torch.matmul(a, b, out=c)
-print(c)                # contents of c have changed
+print(c)  # contents of c have changed
 
-assert c is d           # test c & d are same object, not just containing equal values
+assert c is d  # test c & d are same object, not just containing equal values
 assert id(c) == old_id  # make sure that our new c is the same object as the old one
 
-torch.rand(2, 2, out=c) # works for creation too!
-print(c)                # c has changed again
+torch.rand(2, 2, out=c)  # works for creation too!
+print(c)  # c has changed again
 assert id(c) == old_id  # still the same object!
 
 
 ##########################################################################
 # Copying Tensors
 # ---------------
-# 
+#
 # As with any object in Python, assigning a tensor to a variable makes the
 # variable a *label* of the tensor, and does not copy it. For example:
-# 
+#
 
 a = torch.ones(2, 2)
 b = a
 
 a[0][1] = 561  # we change a...
-print(b)       # ...and b is also altered
+print(b)  # ...and b is also altered
 
 
 ######################################################################
 # But what if you want a separate copy of the data to work on? The
 # ``clone()`` method is there for you:
-# 
+#
 
 a = torch.ones(2, 2)
 b = a.clone()
 
-assert b is not a      # different objects in memory...
+assert b is not a  # different objects in memory...
 print(torch.eq(a, b))  # ...but still with the same contents!
 
-a[0][1] = 561          # a changes...
-print(b)               # ...but b is still all ones
+a[0][1] = 561  # a changes...
+print(b)  # ...but b is still all ones
 
 
 #########################################################################
@@ -577,7 +577,7 @@
-# If your source tensor has autograd, enabled then so will the clone.
+# If your source tensor has autograd enabled, then so will the clone.
 # **This will be covered more deeply in the video on autograd,** but if
 # you want the light version of the details, continue on.
-# 
+#
 # *In many cases, this will be what you want.* For example, if your model
 # has multiple computation paths in its ``forward()`` method, and *both*
 # the original tensor and its clone contribute to the model’s output, then
@@ -585,11 +585,11 @@
 # If your source tensor has autograd enabled (which it generally will if
 # it’s a set of learning weights or derived from a computation involving
 # the weights), then you’ll get the result you want.
-# 
+#
 # On the other hand, if you’re doing a computation where *neither* the
 # original tensor nor its clone need to track gradients, then as long as
 # the source tensor has autograd turned off, you’re good to go.
-# 
+#
 # *There is a third case,* though: Imagine you’re performing a computation
 # in your model’s ``forward()`` function, where gradients are turned on
 # for everything by default, but you want to pull out some values
@@ -597,9 +597,9 @@
 # cloned copy of your source tensor to track gradients - performance is
 # improved with autograd’s history tracking turned off. For this, you can
 # use the ``.detach()`` method on the source tensor:
-# 
+#
 
-a = torch.rand(2, 2, requires_grad=True) # turn on autograd
+a = torch.rand(2, 2, requires_grad=True)  # turn on autograd
 print(a)
 
 b = a.clone()
@@ -613,7 +613,7 @@
 
 #########################################################################
 # What’s happening here?
-# 
+#
 # -  We create ``a`` with ``requires_grad=True`` turned on. **We haven’t
 #    covered this optional argument yet, but will during the unit on
 #    autograd.**
@@ -626,33 +626,33 @@
 # -  We clone ``a`` into ``c``, but we call ``detach()`` first.
 # -  Printing ``c``, we see no computation history, and no
 #    ``requires_grad=True``.
-# 
+#
 # The ``detach()`` method *detaches the tensor from its computation
 # history.* It says, “do whatever comes next as if autograd was off.” It
 # does this *without* changing ``a`` - you can see that when we print
 # ``a`` again at the end, it retains its ``requires_grad=True`` property.
-# 
+#
 # Moving to `Accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
 # -------------
-# 
+#
 # One of the major advantages of PyTorch is its robust acceleration on an
 # `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
-# such as CUDA, MPS, MTIA, or XPU. 
+# such as CUDA, MPS, MTIA, or XPU.
 # So far, everything we’ve done has been on CPU. How do we move to the faster
 # hardware?
-# 
+#
 # First, we should check whether an accelerator is available, with the
 # ``is_available()`` method.
-# 
+#
 # .. note::
 #      If you do not have an accelerator, the executable cells in this section will not execute any
 #      accelerator-related code.
-# 
+#
 
 if torch.accelerator.is_available():
-    print('We have an accelerator!')
+    print("We have an accelerator!")
 else:
-    print('Sorry, CPU only.')
+    print("Sorry, CPU only.")
 
 
 ##########################################################################
@@ -663,16 +663,16 @@
 # move *all* the data needed for that computation to memory accessible by
 # that device. (Colloquially, “moving the data to memory accessible by the
-# GPU” is shorted to, “moving the data to the GPU”.)
+# GPU” is shortened to, “moving the data to the GPU”.)
-# 
+#
 # There are multiple ways to get your data onto your target device. You
 # may do it at creation time:
-# 
+#
 
 if torch.accelerator.is_available():
     gpu_rand = torch.rand(2, 2, device=torch.accelerator.current_accelerator())
     print(gpu_rand)
 else:
-    print('Sorry, CPU only.')
+    print("Sorry, CPU only.")
 
 
 ##########################################################################
@@ -680,20 +680,24 @@
 # when we want to create our tensor on the accelerator with the optional
 # ``device`` argument. You can see when we print the new tensor, PyTorch
 # informs us which device it’s on (if it’s not on CPU).
-# 
+#
 # You can query the number of accelerators with ``torch.accelerator.device_count()``. If
 # you have more than one accelerator, you can specify them by index, take CUDA for example:
 # ``device='cuda:0'``, ``device='cuda:1'``, etc.
-# 
+#
 # As a coding practice, specifying our devices everywhere with string
 # constants is pretty fragile. In an ideal world, your code would perform
 # robustly whether you’re on CPU or accelerator hardware. You can do this by
 # creating a device handle that can be passed to your tensors instead of a
 # string:
-# 
+#
 
-my_device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device('cpu')
-print('Device: {}'.format(my_device))
+my_device = (
+    torch.accelerator.current_accelerator()
+    if torch.accelerator.is_available()
+    else torch.device("cpu")
+)
+print(f"Device: {my_device}")
 
 x = torch.rand(2, 2, device=my_device)
 print(x)
@@ -704,7 +708,7 @@
 # another with the ``to()`` method. The following line of code creates a
 # tensor on CPU, and moves it to whichever device handle you acquired in
 # the previous cell.
-# 
+#
 
 y = torch.rand(2, 2)
 y = y.to(my_device)
@@ -715,35 +719,35 @@
 # more tensors, *all of the tensors must be on the same device*. The
 # following code will throw a runtime error, regardless of whether you
 # have an accelerator device available, take CUDA for example:
-# 
+#
 # .. code-block:: python
-# 
+#
 #    x = torch.rand(2, 2)
 #    y = torch.rand(2, 2, device='cuda')
 #    z = x + y  # exception will be thrown
-# 
+#
 
 
 ###########################################################################
 # Manipulating Tensor Shapes
 # --------------------------
-# 
+#
 # Sometimes, you’ll need to change the shape of your tensor. Below, we’ll
 # look at a few common cases, and how to handle them.
-# 
+#
 # Changing the Number of Dimensions
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 
+#
 # One case where you might need to change the number of dimensions is
 # passing a single instance of input to your model. PyTorch models
 # generally expect *batches* of input.
-# 
+#
 # For example, imagine having a model that works on 3 x 226 x 226 images -
 # a 226-pixel square with 3 color channels. When you load and transform
 # it, you’ll get a tensor of shape ``(3, 226, 226)``. Your model, though,
 # is expecting input of shape ``(N, 3, 226, 226)``, where ``N`` is the
 # number of images in the batch. So how do you make a batch of one?
-# 
+#
 
 a = torch.rand(3, 226, 226)
 b = a.unsqueeze(0)
@@ -756,11 +760,11 @@
 # The ``unsqueeze()`` method adds a dimension of extent 1.
 # ``unsqueeze(0)`` adds it as a new zeroth dimension - now you have a
 # batch of one!
-# 
+#
 # So if that’s *un*\ squeezing? What do we mean by squeezing? We’re taking
 # advantage of the fact that any dimension of extent 1 *does not* change
 # the number of elements in the tensor.
-# 
+#
 
 c = torch.rand(1, 1, 1, 1, 1)
 print(c)
@@ -772,10 +776,10 @@
 # have shape ``(N, 20)``, where ``N`` is the number of instances in the
 # input batch. That means that for our single-input batch, we’ll get an
 # output of shape ``(1, 20)``.
-# 
+#
 # What if you want to do some *non-batched* computation with that output -
 # something that’s just expecting a 20-element vector?
-# 
+#
 
 a = torch.rand(1, 20)
 print(a.shape)
@@ -797,44 +801,44 @@
 # 1-dimensional, and if you look closely at the output of the cell above
 # you’ll see that printing ``a`` shows an “extra” set of square brackets
 # ``[]`` due to having an extra dimension.
-# 
+#
 # You may only ``squeeze()`` dimensions of extent 1. See above where we
 # try to squeeze a dimension of size 2 in ``c``, and get back the same
 # shape we started with. Calls to ``squeeze()`` and ``unsqueeze()`` can
 # only act on dimensions of extent 1 because to do otherwise would change
 # the number of elements in the tensor.
-# 
+#
 # Another place you might use ``unsqueeze()`` is to ease broadcasting.
 # Recall the example above where we had the following code:
-# 
+#
 # .. code-block:: python
-# 
+#
 #    a = torch.ones(4, 3, 2)
-# 
+#
 #    c = a * torch.rand(   3, 1) # 3rd dim = 1, 2nd dim identical to a
 #    print(c)
-# 
+#
 # The net effect of that was to broadcast the operation over dimensions 0
 # and 2, causing the random, 3 x 1 tensor to be multiplied element-wise by
 # every 3-element column in ``a``.
-# 
+#
-# What if the random vector had just been 3-element vector? We’d lose the
+# What if the random vector had just been a 3-element vector? We’d lose the
 # ability to do the broadcast, because the final dimensions would not
 # match up according to the broadcasting rules. ``unsqueeze()`` comes to
 # the rescue:
-# 
+#
 
 a = torch.ones(4, 3, 2)
-b = torch.rand(   3)     # trying to multiply a * b will give a runtime error
-c = b.unsqueeze(1)       # change to a 2-dimensional tensor, adding new dim at the end
+b = torch.rand(3)  # trying to multiply a * b will give a runtime error
+c = b.unsqueeze(1)  # change to a 2-dimensional tensor, adding new dim at the end
 print(c.shape)
-print(a * c)             # broadcasting works again!
+print(a * c)  # broadcasting works again!
 
 
 ######################################################################
 # The ``squeeze()`` and ``unsqueeze()`` methods also have in-place
 # versions, ``squeeze_()`` and ``unsqueeze_()``:
-# 
+#
 
 batch_me = torch.rand(3, 226, 226)
 print(batch_me.shape)
@@ -852,7 +856,7 @@
 # layer expects a 1-dimensional input. ``reshape()`` will do this for you,
 # provided that the dimensions you request yield the same number of
 # elements as the input tensor has:
-# 
+#
 
 output3d = torch.rand(6, 20, 20)
 print(output3d.shape)
@@ -872,34 +876,34 @@
 #      lets us cheat and just use a series of integers. Here, we had to add the
 #      parentheses and comma to convince the method that this is really a
 #      one-element tuple.
-# 
+#
 # When it can, ``reshape()`` will return a *view* on the tensor to be
 # changed - that is, a separate tensor object looking at the same
 # underlying region of memory. *This is important:* That means any change
 # made to the source tensor will be reflected in the view on that tensor,
 # unless you ``clone()`` it.
-# 
+#
 # There *are* conditions, beyond the scope of this introduction, where
 # ``reshape()`` has to return a tensor carrying a copy of the data. For
 # more information, see the
 # `docs <https://pytorch.org/docs/stable/torch.html#torch.reshape>`__.
-# 
+#
 
 
 #######################################################################
 # NumPy Bridge
 # ------------
-# 
+#
 # In the section above on broadcasting, it was mentioned that PyTorch’s
 # broadcast semantics are compatible with NumPy’s - but the kinship
 # between PyTorch and NumPy goes even deeper than that.
-# 
+#
 # If you have existing ML or scientific code with data stored in NumPy
 # ndarrays, you may wish to express that same data as PyTorch tensors,
 # whether to take advantage of PyTorch’s GPU acceleration, or its
 # efficient abstractions for building ML models. It’s easy to switch
 # between ndarrays and PyTorch tensors:
-# 
+#
 
 import numpy as np
 
@@ -914,9 +918,9 @@
 # PyTorch creates a tensor of the same shape and containing the same data
 # as the NumPy array, going so far as to keep NumPy’s default 64-bit float
 # data type.
-# 
+#
 # The conversion can just as easily go the other way:
-# 
+#
 
 pytorch_rand = torch.rand(2, 3)
 print(pytorch_rand)
@@ -929,7 +933,7 @@
 # It is important to know that these converted objects are using *the same
 # underlying memory* as their source objects, meaning that changes to one
 # are reflected in the other:
-# 
+#
 
 numpy_array[1, 1] = 23
 print(pytorch_tensor)