diff --git a/beginner_source/introyt/autogradyt_tutorial.py b/beginner_source/introyt/autogradyt_tutorial.py index abf75a7d266..6320af08672 100644 --- a/beginner_source/introyt/autogradyt_tutorial.py +++ b/beginner_source/introyt/autogradyt_tutorial.py @@ -119,7 +119,6 @@ import matplotlib.ticker as ticker import math - ######################################################################### # Next, we’ll create an input tensor full of evenly spaced values on the # interval :math:`[0, 2{\pi}]`, and specify ``requires_grad=True``. (Like @@ -127,16 +126,16 @@ # optional ``requires_grad`` option.) Setting this flag means that in # every computation that follows, autograd will be accumulating the # history of the computation in the output tensors of that computation. -# +# -a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) +a = torch.linspace(0.0, 2.0 * math.pi, steps=25, requires_grad=True) print(a) ######################################################################## # Next, we’ll perform a computation, and plot its output in terms of its # inputs: -# +# b = torch.sin(a) plt.plot(a.detach(), b.detach()) @@ -145,7 +144,7 @@ ######################################################################## # Let’s have a closer look at the tensor ``b``. When we print it, we see # an indicator that it is tracking its computation history: -# +# print(b) @@ -154,9 +153,9 @@ # This ``grad_fn`` gives us a hint that when we execute the # backpropagation step and compute gradients, we’ll need to compute the # derivative of :math:`\sin(x)` for all this tensor’s inputs. -# +# # Let’s perform some more computations: -# +# c = 2 * b print(c) @@ -170,7 +169,7 @@ # ``.backward()`` on a tensor with no arguments, it expects the calling # tensor to contain only a single element, as is the case when computing a # loss function. -# +# out = d.sum() print(out) @@ -183,19 +182,24 @@ # shows us the gradient functions for all the prior tensors. 
Note that # ``a.grad_fn`` is reported as ``None``, indicating that this was an input # to the function with no history of its own. -# +# -print('d:') +print("d:") print(d.grad_fn) print(d.grad_fn.next_functions) print(d.grad_fn.next_functions[0][0].next_functions) print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions) -print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions) -print('\nc:') +print( + d.grad_fn.next_functions[0][0] + .next_functions[0][0] + .next_functions[0][0] + .next_functions +) +print("\nc:") print(c.grad_fn) -print('\nb:') +print("\nb:") print(b.grad_fn) -print('\na:') +print("\na:") print(a.grad_fn) @@ -203,7 +207,7 @@ # With all this machinery in place, how do we get derivatives out? You # call the ``backward()`` method on the output, and check the input’s # ``grad`` property to inspect the gradients: -# +# out.backward() print(a.grad) @@ -212,54 +216,56 @@ ######################################################################### # Recall the computation steps we took to get here: -# +# # .. code-block:: python -# +# # a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) # b = torch.sin(a) # c = 2 * b # d = c + 1 # out = d.sum() -# +# # Adding a constant, as we did to compute ``d``, does not change the # derivative. That leaves :math:`c = 2 * b = 2 * \sin(a)`, the derivative # of which should be :math:`2 * \cos(a)`. Looking at the graph above, # that’s just what we see. -# +# # Be aware that only *leaf nodes* of the computation have their gradients # computed. If you tried, for example, ``print(c.grad)`` you’d get back # ``None``. In this simple example, only the input is a leaf node, so only # it has gradients computed. -# +# # Autograd in Training # -------------------- -# +# # We’ve had a brief look at how autograd works, but how does it look when # it’s used for its intended purpose? Let’s define a small model and # examine how it changes after a single training batch. 
First, define a # few constants, our model, and some stand-ins for inputs and outputs: -# +# BATCH_SIZE = 16 DIM_IN = 1000 HIDDEN_SIZE = 100 DIM_OUT = 10 + class TinyModel(torch.nn.Module): def __init__(self): super(TinyModel, self).__init__() - + self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE) self.relu = torch.nn.ReLU() self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT) - + def forward(self, x): x = self.layer1(x) x = self.relu(x) x = self.layer2(x) return x - + + some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False) ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False) @@ -271,12 +277,12 @@ def forward(self, x): # ``requires_grad=True`` for the model’s layers. Within a subclass of # ``torch.nn.Module``, it’s assumed that we want to track gradients on the # layers’ weights for learning. -# +# # If we look at the layers of the model, we can examine the values of the # weights, and verify that no gradients have been computed yet: -# +# -print(model.layer2.weight[0][0:10]) # just a small slice +print(model.layer2.weight[0][0:10]) # just a small slice print(model.layer2.weight.grad) @@ -285,7 +291,7 @@ def forward(self, x): # loss function, we’ll just use the square of the Euclidean distance # between our ``prediction`` and the ``ideal_output``, and we’ll use a # basic stochastic gradient descent optimizer. -# +# optimizer = torch.optim.SGD(model.parameters(), lr=0.001) @@ -297,7 +303,7 @@ def forward(self, x): ###################################################################### # Now, let’s call ``loss.backward()`` and see what happens: -# +# loss.backward() print(model.layer2.weight[0][0:10]) @@ -309,7 +315,7 @@ def forward(self, x): # weight, but the weights remain unchanged, because we haven’t run the # optimizer yet. The optimizer is responsible for updating model weights # based on the computed gradients. 
-# +# optimizer.step() print(model.layer2.weight[0][0:10]) @@ -318,12 +324,12 @@ def forward(self, x): ###################################################################### # You should see that ``layer2``\ ’s weights have changed. -# +# # One important thing about the process: After calling # ``optimizer.step()``, you need to call ``optimizer.zero_grad()``, or # else every time you run ``loss.backward()``, the gradients on the # learning weights will accumulate: -# +# print(model.layer2.weight.grad[0][0:10]) @@ -331,7 +337,7 @@ def forward(self, x): prediction = model(some_input) loss = (ideal_output - prediction).pow(2).sum() loss.backward() - + print(model.layer2.weight.grad[0][0:10]) optimizer.zero_grad(set_to_none=False) @@ -345,17 +351,17 @@ def forward(self, x): # gradients will be much larger. Failing to zero the gradients before # running your next training batch will cause the gradients to blow up in # this manner, causing incorrect and unpredictable learning results. -# +# # Turning Autograd Off and On # --------------------------- -# +# # There are situations where you will need fine-grained control over # whether autograd is enabled. There are multiple ways to do this, # depending on the situation. -# +# # The simplest is to change the ``requires_grad`` flag on a tensor # directly: -# +# a = torch.ones(2, 3, requires_grad=True) print(a) @@ -374,10 +380,10 @@ def forward(self, x): # a tensor, ``a``, that had autograd turned on. When we turn off autograd # explicitly with ``a.requires_grad = False``, computation history is no # longer tracked, as we see when we compute ``b2``. 
-# +# # If you only need autograd turned off temporarily, a better way is to use # the ``torch.no_grad()``: -# +# a = torch.ones(2, 3, requires_grad=True) * 2 b = torch.ones(2, 3, requires_grad=True) * 3 @@ -396,11 +402,13 @@ def forward(self, x): ########################################################################## # ``torch.no_grad()`` can also be used as a function or method decorator: -# +# + def add_tensors1(x, y): return x + y + @torch.no_grad() def add_tensors2(x, y): return x + y @@ -420,12 +428,12 @@ def add_tensors2(x, y): # There’s a corresponding context manager, ``torch.enable_grad()``, for # turning autograd on when it isn’t already. It may also be used as a # decorator. -# +# # Finally, you may have a tensor that requires gradient tracking, but you # want a copy that does not. For this we have the ``Tensor`` object’s # ``detach()`` method - it creates a copy of the tensor that is *detached* # from the computation history: -# +# x = torch.rand(5, requires_grad=True) y = x.detach() @@ -440,10 +448,10 @@ def add_tensors2(x, y): # conversion from a PyTorch tensor to a NumPy array is not enabled for # tensors with requires_grad=True. Making a detached copy lets us move # forward. -# +# # Autograd and In-place Operations # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # In every example in this notebook so far, we’ve used variables to # capture the intermediate values of a computation. Autograd needs these # intermediate values to perform gradient computations. *For this reason, @@ -452,32 +460,32 @@ def add_tensors2(x, y): # derivatives in the ``backward()`` call. PyTorch will even stop you if # you attempt an in-place operation on leaf variable that requires # autograd, as shown below. -# +# # .. note:: # The following code cell throws a runtime error. This is expected. -# +# # .. code-block:: python # # a = torch.linspace(0., 2. 
* math.pi, steps=25, requires_grad=True) -# torch.sin_(a) +# a.sin_() # ######################################################################### # Autograd Profiler # ----------------- -# +# # Autograd tracks every step of your computation in detail. Such a # computation history, combined with timing information, would make a # handy profiler - and autograd has that feature baked in. Here’s a quick # example usage: -# +# -device = torch.device('cpu') +device = torch.device("cpu") run_on_gpu = False if torch.cuda.is_available(): - device = torch.device('cuda') + device = torch.device("cuda") run_on_gpu = True - + x = torch.randn(2, 3, requires_grad=True) y = torch.rand(2, 3, requires_grad=True) z = torch.ones(2, 3, requires_grad=True) @@ -485,8 +493,8 @@ def add_tensors2(x, y): with torch.autograd.profiler.profile(use_cuda=run_on_gpu) as prf: for _ in range(1000): z = (z / x) * y - -print(prf.key_averages().table(sort_by='self_cpu_time_total')) + +print(prf.key_averages().table(sort_by="self_cpu_time_total")) ########################################################################## @@ -494,15 +502,15 @@ def add_tensors2(x, y): # data by input tensor shape, and export data as a Chrome tracing tools # file. For full details of the API, see the # `documentation `__. -# +# # Advanced Topic: More Autograd Detail and the High-Level API # ----------------------------------------------------------- -# +# # If you have a function with an n-dimensional input and m-dimensional # output, :math:`\vec{y}=f(\vec{x})`, the complete gradient is a matrix of # the derivative of every output with respect to every input, called the # *Jacobian:* -# +# # .. 
math:: # # J @@ -512,22 +520,22 @@ def add_tensors2(x, y): # \vdots & \ddots & \vdots\\ # \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} # \end{array}\right) -# +# # If you have a second function, :math:`l=g\left(\vec{y}\right)` that # takes m-dimensional input (that is, the same dimensionality as the # output above), and returns a scalar output, you can express its # gradients with respect to :math:`\vec{y}` as a column vector, # :math:`v=\left(\begin{array}{ccc}\frac{\partial l}{\partial y_{1}} & \cdots & \frac{\partial l}{\partial y_{m}}\end{array}\right)^{T}` # - which is really just a one-column Jacobian. -# +# # More concretely, imagine the first function as your PyTorch model (with # potentially many inputs and many outputs) and the second function as a # loss function (with the model’s output as input, and the loss value as # the scalar output). -# +# # If we multiply the first function’s Jacobian by the gradient of the # second function, and apply the chain rule, we get: -# +# # .. math:: # # J^{T}\cdot v=\left(\begin{array}{ccc} @@ -543,24 +551,24 @@ def add_tensors2(x, y): # \vdots\\ # \frac{\partial l}{\partial x_{n}} # \end{array}\right) -# +# # Note: You could also use the equivalent operation :math:`v^{T}\cdot J`, # and get back a row vector. -# +# # The resulting column vector is the *gradient of the second function with # respect to the inputs of the first* - or in the case of our model and # loss function, the gradient of the loss with respect to the model # inputs. -# +# # **``torch.autograd`` is an engine for computing these products.** This # is how we accumulate the gradients over the learning weights during the # backward pass. -# +# # For this reason, the ``backward()`` call can *also* take an optional # vector input. This vector represents a set of gradients over the tensor, # which are multiplied by the Jacobian of the autograd-traced tensor that # precedes it. 
Let’s try a specific example with a small vector: -# +# x = torch.randn(3, requires_grad=True) @@ -577,9 +585,9 @@ def add_tensors2(x, y): # outputs. For a multi-dimensional output, autograd expects us to provide # gradients for those three outputs that it can multiply into the # Jacobian: -# +# -v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) # stand-in for gradients +v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) # stand-in for gradients y.backward(v) print(x.grad) @@ -588,25 +596,27 @@ def add_tensors2(x, y): ########################################################################## # (Note that the output gradients are all related to powers of two - which # we’d expect from a repeated doubling operation.) -# +# # The High-Level API # ~~~~~~~~~~~~~~~~~~ -# +# # There is an API on autograd that gives you direct access to important # differential matrix and vector operations. In particular, it allows you # to calculate the Jacobian and the *Hessian* matrices of a particular # function for particular inputs. (The Hessian is like the Jacobian, but # expresses all partial *second* derivatives.) It also provides methods # for taking vector products with these matrices. -# +# # Let’s take the Jacobian of a simple function, evaluated for 2 # single-element inputs: -# +# + def exp_adder(x, y): return 2 * x.exp() + 3 * y -inputs = (torch.rand(1), torch.rand(1)) # arguments for the function + +inputs = (torch.rand(1), torch.rand(1)) # arguments for the function print(inputs) torch.autograd.functional.jacobian(exp_adder, inputs) @@ -615,11 +625,11 @@ def exp_adder(x, y): # If you look closely, the first output should equal :math:`2e^x` (since # the derivative of :math:`e^x` is :math:`e^x`), and the second value # should be 3.
-# +# # You can, of course, do this with higher-order tensors: -# +# -inputs = (torch.rand(3), torch.rand(3)) # arguments for the function +inputs = (torch.rand(3), torch.rand(3)) # arguments for the function print(inputs) torch.autograd.functional.jacobian(exp_adder, inputs) @@ -628,10 +638,11 @@ def exp_adder(x, y): # The ``torch.autograd.functional.hessian()`` method works identically # (assuming your function is twice differentiable), but returns a matrix # of all second derivatives. -# +# # There is also a function to directly compute the vector-Jacobian # product, if you provide the vector: -# +# + def do_some_doubling(x): y = x * 2 @@ -639,6 +650,7 @@ def do_some_doubling(x): y = y * 2 return y + inputs = torch.randn(3) my_gradients = torch.tensor([0.1, 1.0, 0.0001]) torch.autograd.functional.vjp(do_some_doubling, inputs, v=my_gradients) @@ -648,8 +660,8 @@ def do_some_doubling(x): # The ``torch.autograd.functional.jvp()`` method performs the same matrix # multiplication as ``vjp()`` with the operands reversed. The ``vhp()`` # and ``hvp()`` methods do the same for a vector-Hessian product. 
-# +# # For more information, including performance notes on the `docs for the # functional # API `__ -# +# diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py index 4d118ad4030..0f20733148a 100644 --- a/beginner_source/introyt/tensors_deeper_tutorial.py +++ b/beginner_source/introyt/tensors_deeper_tutorial.py @@ -30,13 +30,12 @@ import torch import math - ######################################################################### # Creating Tensors # ---------------- -# +# # The simplest way to create a tensor is with the ``torch.empty()`` call: -# +# x = torch.empty(3, 4) print(type(x)) @@ -45,7 +44,7 @@ ########################################################################## # Let’s unpack what we just did: -# +# # - We created a tensor using one of the numerous factory methods # attached to the ``torch`` module. # - The tensor itself is 2-dimensional, having 3 rows and 4 columns. @@ -57,21 +56,21 @@ # tensor. The ``torch.empty()`` call allocates memory for the tensor, # but does not initialize it with any values - so what you’re seeing is # whatever was in memory at the time of allocation. -# +# # A brief note about tensors and their number of dimensions, and # terminology: -# +# # - You will sometimes see a 1-dimensional tensor called a -# *vector.* +# *vector.* # - Likewise, a 2-dimensional tensor is often referred to as a -# *matrix.* +# *matrix.* # - Anything with more than two dimensions is generally just # called a tensor. -# +# # More often than not, you’ll want to initialize your tensor with some # value. Common cases are all zeros, all ones, or random values, and the # ``torch`` module provides factory methods for all of these: -# +# zeros = torch.zeros(2, 3) print(zeros) @@ -88,10 +87,10 @@ # The factory methods all do just what you’d expect - we have a tensor # full of zeros, another full of ones, and another with random values # between 0 and 1.
-# +# # Random Tensors and Seeding # ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Speaking of the random tensor, did you notice the call to # ``torch.manual_seed()`` immediately preceding it? Initializing tensors, # such as a model’s learning weights, with random values is common but @@ -99,7 +98,7 @@ # some assurance of the reproducibility of your results. Manually setting # your random number generator’s seed is the way to do this. Let’s look # more closely: -# +# torch.manual_seed(1729) random1 = torch.rand(2, 3) @@ -121,18 +120,18 @@ # identical values, as do ``random2`` and ``random4``. Manually setting # the RNG’s seed resets it, so that identical computations depending on # random numbers should, in most settings, provide identical results. -# +# # For more information, see the `PyTorch documentation on # reproducibility `__. -# +# # Tensor Shapes # ~~~~~~~~~~~~~ -# +# # Often, when you’re performing operations on two or more tensors, they # will need to be of the same *shape* - that is, having the same number of # dimensions and the same number of cells in each dimension. For that, we # have the ``torch.*_like()`` methods: -# +# x = torch.empty(2, 2, 3) print(x.shape) @@ -160,15 +159,15 @@ # property on a tensor. This property contains a list of the extent of # each dimension of a tensor - in our case, ``x`` is a three-dimensional # tensor with shape 2 x 2 x 3. -# +# # Below that, we call the ``.empty_like()``, ``.zeros_like()``, # ``.ones_like()``, and ``.rand_like()`` methods. Using the ``.shape`` # property, we can verify that each of these methods returns a tensor of # identical dimensionality and extent. -# +# # The last way to create a tensor that we will cover is to specify its data # directly from a Python collection: -# +# some_constants = torch.tensor([[3.1415926, 2.71828], [1.61803, 0.0072897]]) print(some_constants) @@ -185,20 +184,20 @@ # tensor if you already have data in a Python tuple or list.
As shown # above, nesting the collections will result in a multi-dimensional # tensor. -# +# # .. note:: # ``torch.tensor()`` creates a copy of the data. -# +# # Tensor Data Types # ~~~~~~~~~~~~~~~~~ -# +# # Setting the datatype of a tensor is possible a couple of ways: -# +# a = torch.ones((2, 3), dtype=torch.int16) print(a) -b = torch.rand((2, 3), dtype=torch.float64) * 20. +b = torch.rand((2, 3), dtype=torch.float64) * 20.0 print(b) c = b.to(torch.int32) @@ -211,40 +210,40 @@ # we set ``dtype=torch.int16`` for the tensor ``a``. When we print ``a``, # we can see that it’s full of ``1`` rather than ``1.`` - Python’s subtle # cue that this is an integer type rather than floating point. -# +# # Another thing to notice about printing ``a`` is that, unlike when we # left ``dtype`` as the default (32-bit floating point), printing the # tensor also specifies its ``dtype``. -# +# # You may have also spotted that we went from specifying the tensor’s # shape as a series of integer arguments, to grouping those arguments in a # tuple. This is not strictly necessary - PyTorch will take a series of # initial, unlabeled integer arguments as a tensor shape - but when adding # the optional arguments, it can make your intent more readable. -# +# # The other way to set the datatype is with the ``.to()`` method. In the # cell above, we create a random floating point tensor ``b`` in the usual # way. Following that, we create ``c`` by converting ``b`` to a 32-bit # integer with the ``.to()`` method. Note that ``c`` contains all the same # values as ``b``, but truncated to integers. -# +# # For more information, see the `data types documentation `__. -# +# # Math & Logic with PyTorch Tensors # --------------------------------- -# +# # Now that you know some of the ways to create a tensor… what can you do # with them? 
-# +# # Let’s look at basic arithmetic first, and how tensors interact with # simple scalars: -# +# ones = torch.zeros(2, 2) + 1 twos = torch.ones(2, 2) * 2 threes = (torch.ones(2, 2) * 7 - 1) / 2 -fours = twos ** 2 -sqrt2s = twos ** 0.5 +fours = twos**2 +sqrt2s = twos**0.5 print(ones) print(twos) @@ -260,10 +259,10 @@ # the output of such an operation will be a tensor, you can chain them # together with the usual operator precedence rules, as in the line where # we create ``threes``. -# +# # Similar operations between two tensors also behave like you’d # intuitively expect: -# +# powers2 = twos ** torch.tensor([[1, 2], [3, 4]]) print(powers2) @@ -279,7 +278,7 @@ # It’s important to note here that all of the tensors in the previous code # cell were of identical shape. What happens when we try to perform a # binary operation on tensors of dissimilar shape? -# +# # .. note:: # The following cell throws a run-time error. This is intentional. # @@ -296,17 +295,17 @@ # In the general case, you cannot operate on tensors of different shape # this way, even in a case like the cell above, where the tensors have an # identical number of elements. -# +# # In Brief: Tensor Broadcasting # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # .. note:: # If you are familiar with broadcasting semantics in NumPy # ndarrays, you’ll find the same rules apply here. -# +# # The exception to the same-shapes rule is *tensor broadcasting.* Here’s # an example: -# +# rand = torch.rand(2, 4) doubled = rand * (torch.ones(1, 4) * 2) @@ -318,66 +317,66 @@ ######################################################################### # What’s the trick here? How is it we got to multiply a 2x4 tensor by a # 1x4 tensor? -# +# # Broadcasting is a way to perform an operation between tensors that have # similarities in their shapes. In the example above, the one-row, # four-column tensor is multiplied by *both rows* of the two-row, # four-column tensor. -# +# # This is an important operation in Deep Learning.
The common example is # multiplying a tensor of learning weights by a *batch* of input tensors, # applying the operation to each instance in the batch separately, and # returning a tensor of identical shape - just like our (2, 4) \* (1, 4) # example above returned a tensor of shape (2, 4). -# +# # The rules for broadcasting are: -# +# # - Each tensor must have at least one dimension - no empty tensors. -# +# # - Comparing the dimension sizes of the two tensors, *going from last to # first:* -# +# # - Each dimension must be equal, *or* -# +# # - One of the dimensions must be of size 1, *or* -# +# # - The dimension does not exist in one of the tensors -# +# # Tensors of identical shape, of course, are trivially “broadcastable”, as # you saw earlier. -# +# # Here are some examples of situations that honor the above rules and # allow broadcasting: -# +# -a = torch.ones(4, 3, 2) +a = torch.ones(4, 3, 2) -b = a * torch.rand( 3, 2) # 3rd & 2nd dims identical to a, dim 1 absent +b = a * torch.rand(3, 2) # 3rd & 2nd dims identical to a, dim 1 absent print(b) -c = a * torch.rand( 3, 1) # 3rd dim = 1, 2nd dim identical to a +c = a * torch.rand(3, 1) # 3rd dim = 1, 2nd dim identical to a print(c) -d = a * torch.rand( 1, 2) # 3rd dim identical to a, 2nd dim = 1 +d = a * torch.rand(1, 2) # 3rd dim identical to a, 2nd dim = 1 print(d) ############################################################################# -# Look closely at the values of each tensor above: +# Look closely at the values of each tensor above: # -# - The multiplication operation that created ``b`` was +# - The multiplication operation that created ``b`` was # broadcast over every “layer” of ``a``. # - For ``c``, the operation was broadcast over every layer and row of -# ``a`` - every 3-element column is identical. +# ``a`` - every 3-element column is identical. # - For ``d``, we switched it around - now every *row* is identical, # across layers and columns. 
-# +# # For more information on broadcasting, see the `PyTorch # documentation `__ # on the topic. -# +# # Here are some examples of attempts at broadcasting that will fail: -# +# # .. note:: # The following cell throws a run-time error. This is intentional. # @@ -396,16 +395,16 @@ ########################################################################### # More Math with Tensors # ~~~~~~~~~~~~~~~~~~~~~~ -# +# # PyTorch tensors have over three hundred operations that can be performed # on them. -# +# # Here is a small sample from some of the major categories of operations: -# +# # common functions a = torch.rand(2, 4) * 2 - 1 -print('Common functions:') +print("Common functions:") print(torch.abs(a)) print(torch.ceil(a)) print(torch.floor(a)) @@ -415,44 +414,44 @@ angles = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4]) sines = torch.sin(angles) inverses = torch.asin(sines) -print('\nSine and arcsine:') +print("\nSine and arcsine:") print(angles) print(sines) print(inverses) # bitwise operations -print('\nBitwise XOR:') +print("\nBitwise XOR:") b = torch.tensor([1, 5, 11]) c = torch.tensor([2, 7, 10]) print(torch.bitwise_xor(b, c)) # comparisons: -print('\nBroadcasted, element-wise equality comparison:') -d = torch.tensor([[1., 2.], [3., 4.]]) +print("\nBroadcasted, element-wise equality comparison:") +d = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) e = torch.ones(1, 2) # many comparison ops support broadcasting! 
-print(torch.eq(d, e)) # returns a tensor of type bool +print(torch.eq(d, e)) # returns a tensor of type bool # reductions: -print('\nReduction ops:') -print(torch.max(d)) # returns a single-element tensor -print(torch.max(d).item()) # extracts the value from the returned tensor -print(torch.mean(d)) # average -print(torch.std(d)) # standard deviation -print(torch.prod(d)) # product of all numbers -print(torch.unique(torch.tensor([1, 2, 1, 2, 1, 2]))) # filter unique elements +print("\nReduction ops:") +print(torch.max(d)) # returns a single-element tensor +print(torch.max(d).item()) # extracts the value from the returned tensor +print(torch.mean(d)) # average +print(torch.std(d)) # standard deviation +print(torch.prod(d)) # product of all numbers +print(torch.unique(torch.tensor([1, 2, 1, 2, 1, 2]))) # filter unique elements # vector and linear algebra operations -v1 = torch.tensor([1., 0., 0.]) # x unit vector -v2 = torch.tensor([0., 1., 0.]) # y unit vector -m1 = torch.rand(2, 2) # random matrix -m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix +v1 = torch.tensor([1.0, 0.0, 0.0]) # x unit vector +v2 = torch.tensor([0.0, 1.0, 0.0]) # y unit vector +m1 = torch.rand(2, 2) # random matrix +m2 = torch.tensor([[3.0, 0.0], [0.0, 3.0]]) # three times identity matrix -print('\nVectors & Matrices:') -print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) +print("\nVectors & Matrices:") +print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) print(m1) -m3 = torch.linalg.matmul(m1, m2) -print(m3) # 3 times m1 -print(torch.linalg.svd(m3)) # singular value decomposition +m3 = torch.matmul(m1, m2) +print(m3) # 3 times m1 +print(torch.linalg.svd(m3)) # singular value decomposition ################################################################################## @@ -461,69 +460,70 @@ # `documentation `__. 
# For more details and the full inventory of linear algebra operations, have a # look at this `documentation `__. -# +# # Altering Tensors in Place # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Most binary operations on tensors will return a third, new tensor. When # we say ``c = a * b`` (where ``a`` and ``b`` are tensors), the new tensor # ``c`` will occupy a region of memory distinct from the other tensors. -# +# # There are times, though, that you may wish to alter a tensor in place - # for example, if you’re doing an element-wise computation where you can # discard intermediate values. For this, most of the math functions have a # version with an appended underscore (``_``) that will alter a tensor in # place. -# +# # For example: -# +# a = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4]) -print('a:') +print("a:") print(a) -print(torch.sin(a)) # this operation creates a new tensor in memory -print(a) # a has not changed +print(torch.sin(a)) # this operation creates a new tensor in memory +print(a) # a has not changed b = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4]) -print('\nb:') +print("\nb:") print(b) -print(torch.sin_(b)) # note the underscore -print(b) # b has changed +print(b.sin_()) # note the underscore +print(b) # b has changed ####################################################################### # For arithmetic operations, there are functions that behave similarly: -# +# a = torch.ones(2, 2) b = torch.rand(2, 2) -print('Before:') +print("Before:") print(a) print(b) -print('\nAfter adding:') +print("\nAfter adding:") print(a.add_(b)) print(a) print(b) -print('\nAfter multiplying') +print("\nAfter multiplying") print(b.mul_(b)) print(b) ########################################################################## # Note that these in-place arithmetic functions are methods on the -# ``torch.Tensor`` object, not attached to the ``torch`` module like many -# other functions (e.g., ``torch.sin()``). 
As you can see from -# ``a.add_(b)``, *the calling tensor is the one that gets changed in -# place.* -# +# ``torch.Tensor`` object. As you can see from ``a.add_(b)``, *the calling +# tensor is the one that gets changed in place.* Similarly, the in-place +# trigonometric functions like ``sin_()`` are also tensor methods rather than +# module-level functions. +# +# # There is another option for placing the result of a computation in an # existing, allocated tensor. Many of the methods and functions we’ve seen # so far - including creation methods! - have an ``out`` argument that # lets you specify a tensor to receive the output. If the ``out`` tensor # is the correct shape and ``dtype``, this can happen without a new memory # allocation: -# +# a = torch.rand(2, 2) b = torch.rand(2, 2) @@ -532,44 +532,44 @@ print(c) d = torch.matmul(a, b, out=c) -print(c) # contents of c have changed +print(c) # contents of c have changed -assert c is d # test c & d are same object, not just containing equal values +assert c is d # test c & d are same object, not just containing equal values assert id(c) == old_id # make sure that our new c is the same object as the old one -torch.rand(2, 2, out=c) # works for creation too! -print(c) # c has changed again +torch.rand(2, 2, out=c) # works for creation too! +print(c) # c has changed again assert id(c) == old_id # still the same object! ########################################################################## # Copying Tensors # --------------- -# +# # As with any object in Python, assigning a tensor to a variable makes the # variable a *label* of the tensor, and does not copy it. For example: -# +# a = torch.ones(2, 2) b = a a[0][1] = 561 # we change a... -print(b) # ...and b is also altered +print(b) # ...and b is also altered ###################################################################### # But what if you want a separate copy of the data to work on? 
The # ``clone()`` method is there for you: -# +# a = torch.ones(2, 2) b = a.clone() -assert b is not a # different objects in memory... +assert b is not a # different objects in memory... print(torch.eq(a, b)) # ...but still with the same contents! -a[0][1] = 561 # a changes... -print(b) # ...but b is still all ones +a[0][1] = 561 # a changes... +print(b) # ...but b is still all ones ######################################################################### @@ -577,7 +577,7 @@ # If your source tensor has autograd, enabled then so will the clone. # **This will be covered more deeply in the video on autograd,** but if # you want the light version of the details, continue on. -# +# # *In many cases, this will be what you want.* For example, if your model # has multiple computation paths in its ``forward()`` method, and *both* # the original tensor and its clone contribute to the model’s output, then @@ -585,11 +585,11 @@ # If your source tensor has autograd enabled (which it generally will if # it’s a set of learning weights or derived from a computation involving # the weights), then you’ll get the result you want. -# +# # On the other hand, if you’re doing a computation where *neither* the # original tensor nor its clone need to track gradients, then as long as # the source tensor has autograd turned off, you’re good to go. -# +# # *There is a third case,* though: Imagine you’re performing a computation # in your model’s ``forward()`` function, where gradients are turned on # for everything by default, but you want to pull out some values @@ -597,9 +597,9 @@ # cloned copy of your source tensor to track gradients - performance is # improved with autograd’s history tracking turned off. 
For this, you can # use the ``.detach()`` method on the source tensor: -# +# -a = torch.rand(2, 2, requires_grad=True) # turn on autograd +a = torch.rand(2, 2, requires_grad=True) # turn on autograd print(a) b = a.clone() @@ -613,7 +613,7 @@ ######################################################################### # What’s happening here? -# +# # - We create ``a`` with ``requires_grad=True`` turned on. **We haven’t # covered this optional argument yet, but will during the unit on # autograd.** @@ -626,33 +626,33 @@ # - We clone ``a`` into ``c``, but we call ``detach()`` first. # - Printing ``c``, we see no computation history, and no # ``requires_grad=True``. -# +# # The ``detach()`` method *detaches the tensor from its computation # history.* It says, “do whatever comes next as if autograd was off.” It # does this *without* changing ``a`` - you can see that when we print # ``a`` again at the end, it retains its ``requires_grad=True`` property. -# +# # Moving to `Accelerator `__ # ------------- -# +# # One of the major advantages of PyTorch is its robust acceleration on an # `accelerator `__ -# such as CUDA, MPS, MTIA, or XPU. +# such as CUDA, MPS, MTIA, or XPU. # So far, everything we’ve done has been on CPU. How do we move to the faster # hardware? -# +# # First, we should check whether an accelerator is available, with the # ``is_available()`` method. -# +# # .. note:: # If you do not have an accelerator, the executable cells in this section will not execute any # accelerator-related code. -# +# if torch.accelerator.is_available(): - print('We have an accelerator!') + print("We have an accelerator!") else: - print('Sorry, CPU only.') + print("Sorry, CPU only.") ########################################################################## @@ -663,16 +663,16 @@ # move *all* the data needed for that computation to memory accessible by # that device. (Colloquially, “moving the data to memory accessible by the # GPU” is shorted to, “moving the data to the GPU”.) 
-# +# # There are multiple ways to get your data onto your target device. You # may do it at creation time: -# +# if torch.accelerator.is_available(): gpu_rand = torch.rand(2, 2, device=torch.accelerator.current_accelerator()) print(gpu_rand) else: - print('Sorry, CPU only.') + print("Sorry, CPU only.") ########################################################################## @@ -680,20 +680,24 @@ # when we want to create our tensor on the accelerator with the optional # ``device`` argument. You can see when we print the new tensor, PyTorch # informs us which device it’s on (if it’s not on CPU). -# +# # You can query the number of accelerators with ``torch.accelerator.device_count()``. If # you have more than one accelerator, you can specify them by index, take CUDA for example: # ``device='cuda:0'``, ``device='cuda:1'``, etc. -# +# # As a coding practice, specifying our devices everywhere with string # constants is pretty fragile. In an ideal world, your code would perform # robustly whether you’re on CPU or accelerator hardware. You can do this by # creating a device handle that can be passed to your tensors instead of a # string: -# +# -my_device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device('cpu') -print('Device: {}'.format(my_device)) +my_device = ( + torch.accelerator.current_accelerator() + if torch.accelerator.is_available() + else torch.device("cpu") +) +print(f"Device: {my_device}") x = torch.rand(2, 2, device=my_device) print(x) @@ -704,7 +708,7 @@ # another with the ``to()`` method. The following line of code creates a # tensor on CPU, and moves it to whichever device handle you acquired in # the previous cell. -# +# y = torch.rand(2, 2) y = y.to(my_device) @@ -715,35 +719,35 @@ # more tensors, *all of the tensors must be on the same device*. The # following code will throw a runtime error, regardless of whether you # have an accelerator device available, take CUDA for example: -# +# # .. 
code-block:: python -# +# # x = torch.rand(2, 2) # y = torch.rand(2, 2, device='cuda') # z = x + y # exception will be thrown -# +# ########################################################################### # Manipulating Tensor Shapes # -------------------------- -# +# # Sometimes, you’ll need to change the shape of your tensor. Below, we’ll # look at a few common cases, and how to handle them. -# +# # Changing the Number of Dimensions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # One case where you might need to change the number of dimensions is # passing a single instance of input to your model. PyTorch models # generally expect *batches* of input. -# +# # For example, imagine having a model that works on 3 x 226 x 226 images - # a 226-pixel square with 3 color channels. When you load and transform # it, you’ll get a tensor of shape ``(3, 226, 226)``. Your model, though, # is expecting input of shape ``(N, 3, 226, 226)``, where ``N`` is the # number of images in the batch. So how do you make a batch of one? -# +# a = torch.rand(3, 226, 226) b = a.unsqueeze(0) @@ -756,11 +760,11 @@ # The ``unsqueeze()`` method adds a dimension of extent 1. # ``unsqueeze(0)`` adds it as a new zeroth dimension - now you have a # batch of one! -# +# # So if that’s *un*\ squeezing? What do we mean by squeezing? We’re taking # advantage of the fact that any dimension of extent 1 *does not* change # the number of elements in the tensor. -# +# c = torch.rand(1, 1, 1, 1, 1) print(c) @@ -772,10 +776,10 @@ # have shape ``(N, 20)``, where ``N`` is the number of instances in the # input batch. That means that for our single-input batch, we’ll get an # output of shape ``(1, 20)``. -# +# # What if you want to do some *non-batched* computation with that output - # something that’s just expecting a 20-element vector? 
-# +# a = torch.rand(1, 20) print(a.shape) @@ -797,44 +801,44 @@ # 1-dimensional, and if you look closely at the output of the cell above # you’ll see that printing ``a`` shows an “extra” set of square brackets # ``[]`` due to having an extra dimension. -# +# # You may only ``squeeze()`` dimensions of extent 1. See above where we # try to squeeze a dimension of size 2 in ``c``, and get back the same # shape we started with. Calls to ``squeeze()`` and ``unsqueeze()`` can # only act on dimensions of extent 1 because to do otherwise would change # the number of elements in the tensor. -# +# # Another place you might use ``unsqueeze()`` is to ease broadcasting. # Recall the example above where we had the following code: -# +# # .. code-block:: python -# +# # a = torch.ones(4, 3, 2) -# +# # c = a * torch.rand( 3, 1) # 3rd dim = 1, 2nd dim identical to a # print(c) -# +# # The net effect of that was to broadcast the operation over dimensions 0 # and 2, causing the random, 3 x 1 tensor to be multiplied element-wise by # every 3-element column in ``a``. -# +# # What if the random vector had just been 3-element vector? We’d lose the # ability to do the broadcast, because the final dimensions would not # match up according to the broadcasting rules. ``unsqueeze()`` comes to # the rescue: -# +# a = torch.ones(4, 3, 2) -b = torch.rand( 3) # trying to multiply a * b will give a runtime error -c = b.unsqueeze(1) # change to a 2-dimensional tensor, adding new dim at the end +b = torch.rand(3) # trying to multiply a * b will give a runtime error +c = b.unsqueeze(1) # change to a 2-dimensional tensor, adding new dim at the end print(c.shape) -print(a * c) # broadcasting works again! +print(a * c) # broadcasting works again! 
###################################################################### # The ``squeeze()`` and ``unsqueeze()`` methods also have in-place # versions, ``squeeze_()`` and ``unsqueeze_()``: -# +# batch_me = torch.rand(3, 226, 226) print(batch_me.shape) @@ -852,7 +856,7 @@ # layer expects a 1-dimensional input. ``reshape()`` will do this for you, # provided that the dimensions you request yield the same number of # elements as the input tensor has: -# +# output3d = torch.rand(6, 20, 20) print(output3d.shape) @@ -872,34 +876,34 @@ # lets us cheat and just use a series of integers. Here, we had to add the # parentheses and comma to convince the method that this is really a # one-element tuple. -# +# # When it can, ``reshape()`` will return a *view* on the tensor to be # changed - that is, a separate tensor object looking at the same # underlying region of memory. *This is important:* That means any change # made to the source tensor will be reflected in the view on that tensor, # unless you ``clone()`` it. -# +# # There *are* conditions, beyond the scope of this introduction, where # ``reshape()`` has to return a tensor carrying a copy of the data. For # more information, see the # `docs `__. -# +# ####################################################################### # NumPy Bridge # ------------ -# +# # In the section above on broadcasting, it was mentioned that PyTorch’s # broadcast semantics are compatible with NumPy’s - but the kinship # between PyTorch and NumPy goes even deeper than that. -# +# # If you have existing ML or scientific code with data stored in NumPy # ndarrays, you may wish to express that same data as PyTorch tensors, # whether to take advantage of PyTorch’s GPU acceleration, or its # efficient abstractions for building ML models. 
It’s easy to switch # between ndarrays and PyTorch tensors: -# +# import numpy as np @@ -914,9 +918,9 @@ # PyTorch creates a tensor of the same shape and containing the same data # as the NumPy array, going so far as to keep NumPy’s default 64-bit float # data type. -# +# # The conversion can just as easily go the other way: -# +# pytorch_rand = torch.rand(2, 3) print(pytorch_rand) @@ -929,7 +933,7 @@ # It is important to know that these converted objects are using *the same # underlying memory* as their source objects, meaning that changes to one # are reflected in the other: -# +# numpy_array[1, 1] = 23 print(pytorch_tensor)