lau*_*ers 5 python regression uncertainty deep-learning pytorch
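我正在尝试用 PyTorch 实现一个带有偶然不确定性(aleatoric uncertainty)估计的回归神经网络,依据的是
Kendall 等人的论文:“计算机视觉的贝叶斯深度学习需要哪些不确定性?”(What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision?)(链接)。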
然而,虽然预测的回归值非常适合所需的真实值,但预测的方差看起来很奇怪,并且损失在训练期间变为负值。
该论文建议有两个输出均值和方差,而不是仅预测回归值。更准确地说,由于稳定性原因,建议预测均值和对数(方差)。因此,我的网络如下所示:
class ReferenceResNet(nn.Module):
    """Fully connected regressor with separate mean and log-variance heads.

    Three tanh-activated hidden layers (1 -> 32 -> 64 -> 128) feed two
    independent linear heads, each producing one value per input row.
    """

    def __init__(self):
        super().__init__()
        # Shared trunk.
        self.fcl1 = nn.Linear(1, 32)
        self.fcl2 = nn.Linear(32, 64)
        self.fcl3 = nn.Linear(64, 128)
        # Output heads. Despite its name, ``fcl_var`` is used as the
        # log-variance head: ``forward`` returns its raw output as ``log_var``.
        self.fcl_mean = nn.Linear(128, 1)
        self.fcl_var = nn.Linear(128, 1)

    def forward(self, x):
        """Return ``(mean, log_var)`` for an input whose last dimension is 1."""
        x = torch.tanh(self.fcl1(x))
        x = torch.tanh(self.fcl2(x))
        x = torch.tanh(self.fcl3(x))
        mean = self.fcl_mean(x)
        log_var = self.fcl_var(x)
        return mean, log_var
Run Code Online (Sandbox Code Playgroud)
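根据该论文,给定这些输出,相应的损失函数由残差回归部分和正则化项组成:
$$\mathcal{L} = \frac{1}{N}\sum_{i=1}^{N}\left[\frac{1}{2}\exp(-s_i)\,\lVert y_i - \hat{y}_i\rVert^2 + \frac{1}{2}\,s_i\right]$$
其中 $s_i = \log\sigma_i^2$ 是网络预测的对数方差,$\hat{y}_i$ 是网络预测的均值。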
我相应地实现了这个损失函数:
def loss_function(pred_mean, pred_log_var, y):
    """Heteroscedastic Gaussian regression loss, averaged over the batch.

    Per element this is ``0.5 * exp(-s) * (y - mean)**2 + 0.5 * s`` with
    ``s = pred_log_var``; the result is the sum of those terms divided by
    ``len(pred_mean)``.

    The residual enters *squared*. The previous ``sqrt(pow(r, 2))`` form
    evaluated to ``|r|`` instead and produced NaN gradients whenever a
    residual was exactly zero.

    The ``0.5 * s`` term is negative when the predicted variance is below 1,
    so the returned loss can legitimately be negative.
    """
    squared_residual = (y - pred_mean) ** 2
    per_element = 0.5 * torch.exp(-pred_log_var) * squared_residual + 0.5 * pred_log_var
    return per_element.sum() / len(pred_mean)
Run Code Online (Sandbox Code Playgroud)
我在自己生成的玩具数据集上尝试了这段代码(请参见带有结果的图像)。然而,训练期间损失变为负值;并且当我在训练后绘制数据集上的预测方差时,它看起来并不合理,而相应的预测均值却与真实值非常吻合:
我已经发现负损失来自正则化项,因为对数对于 0 到 1 之间的值是负数,但是,我不认为正则化项的绝对值应该比回归部分增长得更大。有谁知道这是什么原因以及我如何防止这种情况发生?为什么我的方差看起来如此奇怪? 为了重现,我的完整代码如下所示:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import TensorDataset
from torchvision import datasets, transforms
import math
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReferenceRegNet(nn.Module):
    """Fully connected regressor with separate mean and log-variance heads.

    Three tanh-activated hidden layers (1 -> 32 -> 64 -> 128) feed two
    independent linear heads, each producing one value per input row.
    """

    def __init__(self):
        super().__init__()
        # Shared trunk.
        self.fcl1 = nn.Linear(1, 32)
        self.fcl2 = nn.Linear(32, 64)
        self.fcl3 = nn.Linear(64, 128)
        # Output heads. Despite its name, ``fcl_var`` is used as the
        # log-variance head: ``forward`` returns its raw output as ``log_var``.
        self.fcl_mean = nn.Linear(128, 1)
        self.fcl_var = nn.Linear(128, 1)

    def forward(self, x):
        """Return ``(mean, log_var)`` for an input whose last dimension is 1."""
        x = torch.tanh(self.fcl1(x))
        x = torch.tanh(self.fcl2(x))
        x = torch.tanh(self.fcl3(x))
        mean = self.fcl_mean(x)
        log_var = self.fcl_var(x)
        return mean, log_var
def toy_function(x):
    """Return the noise-free toy target: two sine waves plus an offset of 2.

    Computes ``sin(x/15 - 4) + 2 + sin(x/10 - 5)`` for a scalar ``x``.
    """
    return math.sin(x / 15 - 4) + 2 + math.sin(x / 10 - 5)
def loss_function(x_mean, x_log_var, y):
    """Heteroscedastic Gaussian regression loss, averaged over the batch.

    Per element this is ``0.5 * exp(-s) * (y - mean)**2 + 0.5 * s`` with
    ``s = x_log_var``; the result is the sum of those terms divided by
    ``len(x_mean)``.

    The residual enters *squared*. The previous ``sqrt(pow(r, 2))`` form
    evaluated to ``|r|`` instead and produced NaN gradients whenever a
    residual was exactly zero.

    The ``0.5 * s`` term is negative when the predicted variance is below 1,
    so the returned loss can legitimately be negative.
    """
    squared_residual = (y - x_mean) ** 2
    per_element = 0.5 * torch.exp(-x_log_var) * squared_residual + 0.5 * x_log_var
    return per_element.sum() / len(x_mean)
# Training / evaluation configuration.
BATCH_SIZE = 10
EVAL_BATCH_SIZE = 10
CLASSES = 1
TRAIN_EPOCHS = 50

# Generate the toy training set: noisy samples of the sine-sum curve.
# For every integer i in [50, 150) draw 2 * 100 points at i + k/100
# (k uniform in 0..100) and add Gaussian noise with std 0.2 to the target.
# Samples are collected in lists and converted once; growing an array with
# np.append re-copies the whole array on every call.
x_samples = []
y_samples = []
for _ in range(2):
    for i in range(50, 150):
        for _ in range(100):
            sampled_x = i + np.random.randint(101) / 100
            sampled_y = toy_function(sampled_x) + np.random.normal(0, 0.2)
            x_samples.append(sampled_x)
            y_samples.append(sampled_y)
x_train_data = np.array(x_samples)
y_train_data = np.array(y_samples)

# Evaluation set: the noise-free curve on a regular grid with step 0.1.
x_eval_data = list(np.arange(50.0, 150.0, 0.1))
y_eval_data = [toy_function(x) for x in x_eval_data]

LOADER_KWARGS = {'num_workers': 0, 'pin_memory': False} if torch.cuda.is_available() else {}
train_set = TensorDataset(torch.Tensor(x_train_data), torch.Tensor(y_train_data))
eval_set = TensorDataset(torch.Tensor(x_eval_data), torch.Tensor(y_eval_data))
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, **LOADER_KWARGS
)
eval_loader = torch.utils.data.DataLoader(
    eval_set, batch_size=EVAL_BATCH_SIZE, shuffle=False, **LOADER_KWARGS
)

TRAIN_SIZE = len(train_loader.dataset)
EVAL_SIZE = len(eval_loader.dataset)
# Every batch must be full so that per-batch mean losses can be averaged
# with equal weight.
assert (TRAIN_SIZE % BATCH_SIZE) == 0, "train set size must be a multiple of BATCH_SIZE"
assert (EVAL_SIZE % EVAL_BATCH_SIZE) == 0, "eval set size must be a multiple of EVAL_BATCH_SIZE"
net = ReferenceRegNet().to(DEVICE)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
losses = {}  # epoch number -> mean training loss over that epoch's batches

# Train the network, evaluating on the noise-free curve after every epoch.
for epoch in range(1, TRAIN_EPOCHS + 1):
    net.train()
    mean_epoch_loss = 0
    for data, target in tqdm(train_loader):
        # The loaders yield 1-D batches; add a feature dimension for nn.Linear.
        data = data.to(DEVICE).unsqueeze(dim=1)
        target = target.to(DEVICE).unsqueeze(dim=1)
        optimizer.zero_grad()
        output_means, output_log_var = net(data)
        loss = loss_function(output_means, output_log_var, target)
        mean_epoch_loss += loss.item()  # raw float value of the loss tensor
        loss.backward()
        optimizer.step()
    mean_epoch_loss = mean_epoch_loss / len(train_loader)
    losses[epoch] = mean_epoch_loss
    print(f"Epoch {epoch}: Train-Loss = {mean_epoch_loss}")

    net.eval()
    with torch.no_grad():
        mean_loss = 0
        for data, target in eval_loader:
            data = data.to(DEVICE).unsqueeze(dim=1)
            target = target.to(DEVICE).unsqueeze(dim=1)
            output_means, output_log_var = net(data)
            mean_loss += loss_function(output_means, output_log_var, target).item()
        mean_loss = mean_loss / len(eval_loader)
        #print("Epoch " + str(epoch) + ": Eval-loss = " + str(mean_loss))
def _predict_curve(xs):
    """Return ``(means, variances)`` predicted by ``net`` for 1-D inputs ``xs``.

    All points are evaluated in a single batched forward pass with gradient
    tracking disabled; both results are 1-D NumPy arrays.
    """
    inputs = torch.Tensor(np.asarray(xs)).to(DEVICE).unsqueeze(dim=1)
    with torch.no_grad():
        pred_means, pred_log_vars = net(inputs)
    means = pred_means.squeeze(dim=1).cpu().numpy()
    variances = torch.exp(pred_log_vars).squeeze(dim=1).cpu().numpy()
    return means, variances


sorted_x_train = sorted(x_train_data)
train_means, train_vars = _predict_curve(sorted_x_train)
eval_means, eval_vars = _predict_curve(x_eval_data)

fig = plt.figure(figsize=(40, 12))  # 40x12 inch figure holding three panels

# Panel 1: training samples, ground-truth curve and predicted means.
ax = fig.add_subplot(1, 3, 1)
ax.set_title("regression value")
ax.set_xlabel("x")
ax.set_ylabel("regression mean")
ax.plot(x_train_data, y_train_data, 'x', color='black')
ax.plot(x_eval_data, y_eval_data, color='red')
ax.plot(sorted_x_train, train_means, color='blue', label='training_perform')
ax.plot(x_eval_data, eval_means, color='green', label='eval_perform')
ax.legend()

# Panel 2: predicted variance, i.e. exp(log_var), over the same inputs.
ax = fig.add_subplot(1, 3, 2)
ax.set_title("variance")
ax.set_xlabel("x")
ax.set_ylabel("regression var")
ax.plot(sorted_x_train, train_vars, label='training data')
ax.plot(x_eval_data, eval_vars, label='test data')
ax.legend()

# Panel 3: mean training loss per epoch.
ax = fig.add_subplot(1, 3, 3)
ax.set_title("training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
epochs, epoch_losses = zip(*sorted(losses.items()))
ax.plot(epochs, epoch_losses, label='loss')
ax.legend()

plt.tight_layout()
plt.savefig('ref_test.png')
Run Code Online (Sandbox Code Playgroud)
TLDR:无论标称损失值是多少,优化都会使梯度变为零时的损失降至最低。
K. Frank 的详细解释:

> 较小的损失——代数意义上正得更少或负得更多——意味着(或应该意味着)更好的预测。优化步骤使用某种形式的梯度下降来减小损失。就优化而言,损失的整体水平并不重要。梯度告诉优化器如何改变模型参数以减小损失,而它并不关心损失的整体水平。

来自同一来源的示例:
> 例如,考虑使用 lossA = MSELoss 进行优化。现在想象一下改用 lossB = lossA - 17.2 进行优化。这个 17.2 实际上不会改变任何东西。确实,“完美”的预测将得到 lossB = -17.2 而不是零。(当然,对于“完美”的预测,lossA 为零。)但这又有什么关系呢?

在你的例子中:你是对的,负的损失值来自对数项。这完全没有问题,它意味着你的训练主要由高置信度(低方差)样本的损失项主导。至于方差的值偏高——我无法对此做太多评论,但应该没有问题,因为损失曲线如预期那样下降。