Part 1: Calculate Gradients
There are two ways of getting gradients:
Backward
import torch

x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)            # y = x**2
y.backward(retain_graph=True)  # compute dy/dx and store it in x.grad
print(x.grad)                  # tensor([6.])
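By default the computation graph is freed after backward(), so a second call would fail; retain_graph=True keeps the graph, and repeated calls accumulate into x.grad. A minimal sketch of that behaviour:

x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)
y.backward(retain_graph=True)
print(x.grad)    # tensor([6.])
y.backward()     # allowed because the graph was retained
print(x.grad)    # tensor([12.]) -- the second gradient is added to the first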
Grad
x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)            # y = x**2
grad_1 = torch.autograd.grad(y, x, create_graph=True)  # returns a tuple of gradients
print(grad_1[0].item())        # 6.0
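Because create_graph=True builds a graph for the gradient itself, the result can be differentiated again. A small sketch computing the second derivative (d²y/dx² = 2 for y = x²):

grad_2 = torch.autograd.grad(grad_1[0], x)   # differentiate the first derivative
print(grad_2[0].item())                      # 2.0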
Part 2: Note
(1) Gradients are not cleared automatically; they accumulate until explicitly zeroed
w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)

for i in range(4):
    a = torch.add(w, x)     # a = w + x
    b = torch.add(w, 1)     # b = w + 1
    y = torch.mul(a, b)     # y = (w + x) * (w + 1)
    y.backward()
    print(w.grad)           # tensor([5.]) on every iteration
    w.grad.zero_()          # clear the gradient; otherwise it accumulates
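For comparison, a sketch with the zeroing removed, where each backward() call adds its gradient to the previous one:

w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)

for i in range(4):
    y = torch.mul(torch.add(w, x), torch.add(w, 1))
    y.backward()
    print(w.grad)   # tensor([5.]), tensor([10.]), tensor([15.]), tensor([20.])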
(2) gradient and gradient function
import torch
w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)
# y=(x+w)*(w+1)
a = torch.add(w, x)     # calling a.retain_grad() here would keep a.grad after backward
b = torch.add(w, 1)
y = torch.mul(a, b)
y.backward()
print("is_leaf:\n", w.is_leaf, x.is_leaf,
a.is_leaf, b.is_leaf, y.is_leaf)print("gradient:\n", w.grad, x.grad, a.grad, b.grad, y.grad)print("w.grad_fn = ", w.grad_fn)
print("x.grad_fn = ", x.grad_fn)
print("a.grad_fn = ", a.grad_fn)
print("b.grad_fn = ", b.grad_fn)
print("y.grad_fn = ", y.grad_fn)
The result is
is_leaf:
True True False False False
gradient:
tensor([5.]) tensor([2.]) None None None
w.grad_fn = None
x.grad_fn = None
a.grad_fn = <AddBackward0 object at 0x7fe63c2edc40>
b.grad_fn = <AddBackward0 object at 0x7fe63c2edaf0>
y.grad_fn = <MulBackward0 object at 0x7fe63c2edc40>
In the computation graph, the leaf tensors (w and x, created directly by the user) keep their gradients after backward(), but they have no gradient function: their grad_fn is None. The non-leaf tensors (a, b, y) do have a grad_fn recording the operation that produced them, but their .grad is not retained by default, which is why it prints as None.
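If the gradient of an intermediate (non-leaf) tensor is needed, retain_grad() can be called on it before backward(). A minimal sketch, reusing w and x from above:

a = torch.add(w, x)
a.retain_grad()          # ask autograd to keep a.grad
b = torch.add(w, 1)
y = torch.mul(a, b)
y.backward()
print(a.grad)            # tensor([2.]) since dy/da = b = w + 1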
(3) Two ways of disabling gradient calculation
Method 1: detach()
# assuming x, w and b are defined, with requires_grad=True on w and b
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)   # False
Method 2: torch.no_grad()
z = torch.matmul(x, w) + b
print(z.requires_grad)       # True

with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)       # False
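As a usage note, a sketch of where this typically appears: gradient tracking is switched off when evaluating a trained model, since no backward pass is needed (model and inputs here are hypothetical placeholders):

model.eval()                      # hypothetical trained model
with torch.no_grad():             # no computation graph is built inside this block
    preds = model(inputs)         # forward pass only; inputs is a placeholder tensor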
Part 3: Using Gradients
3.1 Regression
import torch
import matplotlib.pyplot as plt

torch.manual_seed(10)

lr = 0.05                                   # learning rate
x = torch.rand(20, 1) * 10                  # training data: y = 2x + 5 + noise
y = 2 * x + (5 + torch.randn(20, 1))
w = torch.randn((1), requires_grad=True)    # parameters to learn
b = torch.zeros((1), requires_grad=True)

for iteration in range(100):
    wx = torch.mul(w, x)                     # forward pass
    y_pred = torch.add(wx, b)
    loss = (0.5 * (y - y_pred) ** 2).mean()  # mean squared error
    loss.backward()                          # backward pass
    b.data.sub_(lr * b.grad)                 # gradient-descent update
    w.data.sub_(lr * w.grad)
    w.grad.zero_()                           # clear gradients for the next iteration
    b.grad.zero_()
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), y_pred.data.numpy(), 'r-', lw=5)
    if loss.data.numpy() < 1:
        break
See the full regression code.
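As a sanity check on what autograd computes here, a sketch reusing w, b, x and y from above: for loss = mean(0.5·(y − y_pred)²) with y_pred = w·x + b, the hand-derived gradients are ∂loss/∂w = mean(−(y − y_pred)·x) and ∂loss/∂b = mean(−(y − y_pred)), and they should agree with w.grad and b.grad:

y_pred = w * x + b
loss = (0.5 * (y - y_pred) ** 2).mean()
loss.backward()

grad_w_manual = (-(y - y_pred) * x).mean()   # d loss / d w, derived by hand
grad_b_manual = (-(y - y_pred)).mean()       # d loss / d b, derived by hand
print(w.grad.item(), grad_w_manual.item())   # the two numbers agree
print(b.grad.item(), grad_b_manual.item())   # the two numbers agree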
3.2 Logistic Regression
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
torch.manual_seed(10)
sample_nums = 100
mean_value = 1.7
bias = 5
n_data = torch.ones(sample_nums, 2)
x0 = torch.normal(mean_value * n_data, 1) + bias
y0 = torch.zeros(sample_nums)
x1 = torch.normal(-mean_value * n_data, 1) + bias
y1 = torch.ones(sample_nums)
train_x = torch.cat((x0, x1), 0)
train_y = torch.cat((y0, y1), 0)

# logistic regression model: sigmoid(Wx + b)
class LR(nn.Module):
    def __init__(self):
        super(LR, self).__init__()
        self.features = nn.Linear(2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.features(x)
        x = self.sigmoid(x)
        return x

lr_net = LR()
loss_fn = nn.BCELoss()
lr = 0.01
optimizer = torch.optim.SGD(lr_net.parameters(), lr=lr, momentum=0.9)
print(lr_net.features.bias)

for iteration in range(100):
    lr_net.train()
    y_pred = lr_net(train_x)                      # forward pass
    loss = loss_fn(y_pred.squeeze(), train_y)     # binary cross-entropy
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # every 20 iterations, evaluate and plot the decision boundary
    if iteration % 20 == 0:
        mask = y_pred.ge(0.5).float().squeeze()   # predicted class labels
        correct = (mask == train_y).sum()
        acc = correct.item() / train_y.size(0)    # training accuracy

        plt.scatter(x0.data.numpy()[:, 0], x0.data.numpy()[:, 1], c='r', label='class 0')
        plt.scatter(x1.data.numpy()[:, 0], x1.data.numpy()[:, 1], c='b', label='class 1')

        # decision boundary: w0 * x + w1 * y + b = 0
        w0, w1 = lr_net.features.weight[0]
        w0, w1 = float(w0.item()), float(w1.item())
        plot_b = float(lr_net.features.bias[0].item())
        plot_x = np.arange(-6, 6, 0.1)
        plot_y = (-w0 * plot_x - plot_b) / w1

        plt.xlim(-10, 10)
        plt.ylim(-10, 10)
        plt.plot(plot_x, plot_y)
        plt.legend()
        plt.show()
        plt.pause(0.5)

        if acc > 0.99:
            break
See the full logistic regression Python code.
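Once trained, the same model can be used for prediction with gradient tracking turned off. A minimal sketch reusing lr_net from above (the sample point is made up for illustration):

new_point = torch.tensor([[5.0, 5.0]])   # hypothetical 2-D sample
with torch.no_grad():
    prob = lr_net(new_point)             # sigmoid output, interpreted as P(class 1)
pred_class = int(prob.ge(0.5).item())    # threshold at 0.5
print(prob.item(), pred_class)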
Part 4: Jacobian Product
In many cases we have a scalar loss function and need to compute the gradient with respect to some parameters. However, there are cases when the output is an arbitrary tensor. In this case, PyTorch lets you compute a so-called Jacobian product rather than the actual gradient.
Instead of computing the Jacobian matrix itself, PyTorch allows you to compute the Jacobian product vᵀ⋅J for a given input vector v = (v₁, …, vₘ). This is achieved by calling backward with v as an argument. The size of v must match the size of the tensor on which backward is called (out in the example below):
inp = torch.eye(5, requires_grad=True)
print(inp.shape) # 5 by 5
out = (inp+1).pow(2)
print(out.shape) # 5 by 5
out.backward(torch.ones_like(inp), retain_graph=True)   # here v is a tensor of ones with the same shape as out
print("First call\n", inp.grad) # 5 by 5
Part 5: Back Propagation
(1) manual gradient calculation: derive the derivative formulas by hand and code them directly
(2) automatic gradient calculation via back propagation: autograd records the computation graph and applies the chain rule for you (see the sketch below)
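A minimal sketch contrasting the two, reusing y = (w + x)(w + 1) from Part 2; by the chain rule, ∂y/∂w = (w + 1) + (w + x) and ∂y/∂x = w + 1:

import torch

w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)
y = (w + x) * (w + 1)

# (1) manual gradient calculation via the chain rule
dy_dw_manual = (w + 1) + (w + x)   # tensor([5.])
dy_dx_manual = w + 1               # tensor([2.])

# (2) automatic gradient calculation via back propagation
y.backward()
print(w.grad, dy_dw_manual)        # both equal 5
print(x.grad, dy_dx_manual)        # both equal 2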