Part 1: Calculate Gradients
There are two ways of getting gradients:
Backward
import torch

x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)            # y = x**2
y.backward(retain_graph=True)  # compute dy/dx and store it in x.grad
print(x.grad)                  # tensor([6.])
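By default the computation graph is freed after backward(), so a second call would fail; retain_graph=True keeps the graph, and repeated calls accumulate into x.grad. A minimal sketch of that behaviour:

x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)
y.backward(retain_graph=True)
print(x.grad)    # tensor([6.])
y.backward()     # allowed because the graph was retained
print(x.grad)    # tensor([12.]) -- the second gradient is added to the first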
Grad
x = torch.tensor([3.0], requires_grad=True)
y = torch.pow(x, 2)            # y = x**2
grad_1 = torch.autograd.grad(y, x, create_graph=True)  # returns a tuple of gradients
print(grad_1[0].item())        # 6.0
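Because create_graph=True builds a graph for the gradient itself, the result can be differentiated again. A small sketch computing the second derivative (d²y/dx² = 2 for y = x²):

grad_2 = torch.autograd.grad(grad_1[0], x)   # differentiate the first derivative
print(grad_2[0].item())                      # 2.0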
Part 2: Note
(1) Gradients are not cleared automatically; they accumulate until explicitly zeroed
w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)

for i in range(4):
    a = torch.add(w, x)     # a = w + x
    b = torch.add(w, 1)     # b = w + 1
    y = torch.mul(a, b)     # y = (w + x) * (w + 1)
    y.backward()
    print(w.grad)           # tensor([5.]) on every iteration
    w.grad.zero_()          # clear the gradient; otherwise it accumulates
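For comparison, a sketch with the zeroing removed, where each backward() call adds its gradient to the previous one:

w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)

for i in range(4):
    y = torch.mul(torch.add(w, x), torch.add(w, 1))
    y.backward()
    print(w.grad)   # tensor([5.]), tensor([10.]), tensor([15.]), tensor([20.])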
(2) gradient and gradient function
import torch
w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)
# y=(x+w)*(w+1)
a = torch.add(w, x)     # calling a.retain_grad() here would keep a.grad after backward
b = torch.add(w, 1)
y = torch.mul(a, b)
y.backward()
print("is_leaf:\n", w.is_leaf, x.is_leaf,
a.is_leaf, b.is_leaf, y.is_leaf)print("gradient:\n", w.grad, x.grad, a.grad, b.grad, y.grad)print("w.grad_fn = ", w.grad_fn)
print("x.grad_fn = ", x.grad_fn)
print("a.grad_fn = ", a.grad_fn)
print("b.grad_fn = ", b.grad_fn)
print("y.grad_fn = ", y.grad_fn)
The result is
is_leaf:
True True False False False
gradient:
tensor([5.]) tensor([2.]) None None None
w.grad_fn = None
x.grad_fn = None
a.grad_fn = <AddBackward0 object at 0x7fe63c2edc40>
b.grad_fn = <AddBackward0 object at 0x7fe63c2edaf0>
y.grad_fn = <MulBackward0 object at 0x7fe63c2edc40>
In the computation graph, the leaf tensors (w and x, created directly by the user) keep their gradients after backward(), but they have no gradient function: their grad_fn is None. The non-leaf tensors (a, b, y) do have a grad_fn recording the operation that produced them, but their .grad is not retained by default, which is why it prints as None.
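If the gradient of an intermediate (non-leaf) tensor is needed, retain_grad() can be called on it before backward(). A minimal sketch, reusing w and x from above:

a = torch.add(w, x)
a.retain_grad()          # ask autograd to keep a.grad
b = torch.add(w, 1)
y = torch.mul(a, b)
y.backward()
print(a.grad)            # tensor([2.]) since dy/da = b = w + 1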
(3) Two ways of disabling gradient calculation
Method 1: detach()
# assuming x, w and b are defined, with requires_grad=True on w and b
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)   # False
Method 2: torch.no_grad()
z = torch.matmul(x, w) + b
print(z.requires_grad)       # True

with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)       # False
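As a usage note, a sketch of where this typically appears: gradient tracking is switched off when evaluating a trained model, since no backward pass is needed (model and inputs here are hypothetical placeholders):

model.eval()                      # hypothetical trained model
with torch.no_grad():             # no computation graph is built inside this block
    preds = model(inputs)         # forward pass only; inputs is a placeholder tensor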
Part 3: Using Gradients
3.1 Regression
import torch
import matplotlib.pyplot as plt

torch.manual_seed(10)

lr = 0.05                                   # learning rate
x = torch.rand(20, 1) * 10                  # training data: y = 2x + 5 + noise
y = 2 * x + (5 + torch.randn(20, 1))
w = torch.randn((1), requires_grad=True)    # parameters to learn
b = torch.zeros((1), requires_grad=True)

for iteration in range(100):
    wx = torch.mul(w, x)                     # forward pass
    y_pred = torch.add(wx, b)
    loss = (0.5 * (y - y_pred) ** 2).mean()  # mean squared error
    loss.backward()                          # backward pass
    b.data.sub_(lr * b.grad)                 # gradient-descent update
    w.data.sub_(lr * w.grad)
    w.grad.zero_()                           # clear gradients for the next iteration
    b.grad.zero_()
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), y_pred.data.numpy(), 'r-', lw=5)
    if loss.data.numpy() < 1:
        break
See the full regression code.
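As a sanity check on what autograd computes here, a sketch reusing w, b, x and y from above: for loss = mean(0.5·(y − y_pred)²) with y_pred = w·x + b, the hand-derived gradients are ∂loss/∂w = mean(−(y − y_pred)·x) and ∂loss/∂b = mean(−(y − y_pred)), and they should agree with w.grad and b.grad:

y_pred = w * x + b
loss = (0.5 * (y - y_pred) ** 2).mean()
loss.backward()

grad_w_manual = (-(y - y_pred) * x).mean()   # d loss / d w, derived by hand
grad_b_manual = (-(y - y_pred)).mean()       # d loss / d b, derived by hand
print(w.grad.item(), grad_w_manual.item())   # the two numbers agree
print(b.grad.item(), grad_b_manual.item())   # the two numbers agree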
3.2 Logistic Regression
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
torch.manual_seed(10)
sample_nums = 100
mean_value = 1.7
bias = 5
n_data = torch.ones(sample_nums, 2)
x0 = torch.normal(mean_value * n_data, 1) + bias
y0 = torch.zeros(sample_nums)
x1 = torch.normal(-mean_value * n_data, 1) + bias
y1 = torch.ones(sample_nums)
train_x = torch.cat((x0, x1), 0)
train_y = torch.cat((y0, y1), 0)

# logistic regression model: sigmoid(Wx + b)
class LR(nn.Module):
    def __init__(self):
        super(LR, self).__init__()
        self.features = nn.Linear(2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.features(x)
        x = self.sigmoid(x)
        return x

lr_net = LR()
loss_fn = nn.BCELoss()
lr = 0.01
optimizer = torch.optim.SGD(lr_net.parameters(), lr=lr, momentum=0.9)
print(lr_net.features.bias)

for iteration in range(100):
    lr_net.train()
    y_pred = lr_net(train_x)                      # forward pass
    loss = loss_fn(y_pred.squeeze(), train_y)     # binary cross-entropy
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # every 20 iterations, evaluate and plot the decision boundary
    if iteration % 20 == 0:
        mask = y_pred.ge(0.5).float().squeeze()   # predicted class labels
        correct = (mask == train_y).sum()
        acc = correct.item() / train_y.size(0)    # training accuracy

        plt.scatter(x0.data.numpy()[:, 0], x0.data.numpy()[:, 1], c='r', label='class 0')
        plt.scatter(x1.data.numpy()[:, 0], x1.data.numpy()[:, 1], c='b', label='class 1')

        # decision boundary: w0 * x + w1 * y + b = 0
        w0, w1 = lr_net.features.weight[0]
        w0, w1 = float(w0.item()), float(w1.item())
        plot_b = float(lr_net.features.bias[0].item())
        plot_x = np.arange(-6, 6, 0.1)
        plot_y = (-w0 * plot_x - plot_b) / w1

        plt.xlim(-10, 10)
        plt.ylim(-10, 10)
        plt.plot(plot_x, plot_y)
        plt.legend()
        plt.show()
        plt.pause(0.5)

        if acc > 0.99:
            break
See the full logistic regression Python code.
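Once trained, the same model can be used for prediction with gradient tracking turned off. A minimal sketch reusing lr_net from above (the sample point is made up for illustration):

new_point = torch.tensor([[5.0, 5.0]])   # hypothetical 2-D sample
with torch.no_grad():
    prob = lr_net(new_point)             # sigmoid output, interpreted as P(class 1)
pred_class = int(prob.ge(0.5).item())    # threshold at 0.5
print(prob.item(), pred_class)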
Part 4: Jacobian Product
In many cases we have a scalar loss function and need to compute the gradient with respect to some parameters. However, there are cases when the output is an arbitrary tensor. In this case, PyTorch lets you compute a so-called Jacobian product rather than the actual gradient.
Instead of computing the Jacobian matrix itself, PyTorch allows you to compute the Jacobian product vᵀ⋅J for a given input vector v = (v₁, …, vₘ). This is achieved by calling backward with v as an argument. The size of v must match the size of the tensor on which backward is called (out in the example below):
inp = torch.eye(5, requires_grad=True)
print(inp.shape) # 5 by 5
out = (inp+1).pow(2)
print(out.shape) # 5 by 5
out.backward(torch.ones_like(inp), retain_graph=True)   # here v is a tensor of ones with the same shape as out
print("First call\n", inp.grad) # 5 by 5
Part 5: Back Propagation
(1) manual gradient calculation: derive the derivative formulas by hand and code them directly
(2) automatic gradient calculation via back propagation: autograd records the computation graph and applies the chain rule for you (see the sketch below)
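A minimal sketch contrasting the two, reusing y = (w + x)(w + 1) from Part 2; by the chain rule, ∂y/∂w = (w + 1) + (w + x) and ∂y/∂x = w + 1:

import torch

w = torch.tensor([1.], requires_grad=True)
x = torch.tensor([2.], requires_grad=True)
y = (w + x) * (w + 1)

# (1) manual gradient calculation via the chain rule
dy_dw_manual = (w + 1) + (w + x)   # tensor([5.])
dy_dx_manual = w + 1               # tensor([2.])

# (2) automatic gradient calculation via back propagation
y.backward()
print(w.grad, dy_dw_manual)        # both equal 5
print(x.grad, dy_dx_manual)        # both equal 2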