I'm trying to reuse some resnet layers for a custom architecture and have run into a problem I can't figure out. Here is a simplified example; when I run:
import torch
import torch.nn as nn
from torchvision import models
from torchsummary import summary


def convrelu(in_channels, out_channels, kernel, padding):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel, padding=padding),
        nn.ReLU(inplace=True),
    )


class ResNetUNet(nn.Module):
    def __init__(self):
        super().__init__()

        self.base_model = models.resnet18(pretrained=False)
        self.base_layers = list(self.base_model.children())
        self.layer0 = nn.Sequential(*self.base_layers[:3])

    def forward(self, x):
        print(x.shape)
        output = self.layer0(x)
        return output


base_model = ResNetUNet().cuda()
summary(base_model, (3, 224, 224))
it gives me:
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 64, 112, 112] 9,408
Conv2d-2 [-1, 64, 112, 112] 9,408
BatchNorm2d-3 [-1, 64, 112, 112] 128
BatchNorm2d-4 [-1, 64, 112, 112] 128
ReLU-5 [-1, 64, 112, 112] 0
ReLU-6 [-1, 64, 112, 112] 0
================================================================
Total params: 19,072
Trainable params: 19,072
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 36.75
Params size (MB): 0.07
Estimated Total Size (MB): 37.40
----------------------------------------------------------------
This is duplicating each layer (there are 2 convs, 2 batchnorms, 2 relus) rather than listing each layer once. If I print out self.base_layers[:3] I get:
[Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False), BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), ReLU(inplace=True)]
which shows only three layers, with no duplicates. Why are my layers being duplicated?

I am using pytorch version 1.4.0.
Your layers aren't actually being invoked twice. This is an artifact of how summary is implemented.

The reason is simply that summary recursively walks all of the module's children and registers a forward hook on each one. Since you have duplicated children (under base_model and under layer0), those duplicated modules get more than one hook registered. When summary runs the forward pass, both hooks on each of those modules fire, so each layer gets reported twice.
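To see the duplication directly, here is a small check of my own (not from the original post) that walks the question's ResNetUNet with Module.apply, which is the same traversal summary uses when registering hooks. The first convolution is reachable both through base_model and through layer0, so it is visited twice:

model = ResNetUNet()                # the original class from the question, with self.base_model as an attribute
visited = []
model.apply(visited.append)         # apply(fn) calls fn on every child, recursively, then on the model itself
first_conv = model.base_model.conv1
print(first_conv is model.layer0[0])  # True: layer0 wraps the very same module object
print(visited.count(first_conv))      # 2: visited once under base_model and once under layer0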
For your toy example, the solution is simply not to assign base_model as an attribute, since it isn't used during the forward pass anyway. That keeps base_model from being registered as a child.
class ResNetUNet(nn.Module):
    def __init__(self):
        super().__init__()

        base_model = models.resnet18(pretrained=False)
        base_layers = list(base_model.children())

        self.layer0 = nn.Sequential(*base_layers[:3])
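As a quick sanity check (my addition, not part of the original answer), repeating the same traversal on this fixed class shows each module is now visited exactly once, so summary (given the same forward as in the question) reports each layer a single time:

model = ResNetUNet()                   # the fixed class above: base_model is a local variable, not a child
visited = []
model.apply(visited.append)            # the traversal summary uses to register hooks
print(visited.count(model.layer0[0]))  # 1: the first conv is now reachable only through layer0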
An alternative solution is to create a modified version of summary that doesn't register more than one hook on the same module. Below is an augmented summary where I use a set called already_registered to keep track of the modules that already have hooks, so that no module gets registered twice.
from collections import OrderedDict
import torch
import torch.nn as nn
import numpy as np


def summary(model, input_size, batch_size=-1, device="cuda"):
    # keep track of registered modules so that we don't add multiple hooks
    already_registered = set()

    def register_hook(module):
        def hook(module, input, output):
            class_name = str(module.__class__).split(".")[-1].split("'")[0]
            module_idx = len(summary)

            m_key = "%s-%i" % (class_name, module_idx + 1)
            summary[m_key] = OrderedDict()
            summary[m_key]["input_shape"] = list(input[0].size())
            summary[m_key]["input_shape"][0] = batch_size
            if isinstance(output, (list, tuple)):
                summary[m_key]["output_shape"] = [
                    [-1] + list(o.size())[1:] for o in output
                ]
            else:
                summary[m_key]["output_shape"] = list(output.size())
                summary[m_key]["output_shape"][0] = batch_size

            params = 0
            if hasattr(module, "weight") and hasattr(module.weight, "size"):
                params += torch.prod(torch.LongTensor(list(module.weight.size())))
                summary[m_key]["trainable"] = module.weight.requires_grad
            if hasattr(module, "bias") and hasattr(module.bias, "size"):
                params += torch.prod(torch.LongTensor(list(module.bias.size())))
            summary[m_key]["nb_params"] = params

        if (
            not isinstance(module, nn.Sequential)
            and not isinstance(module, nn.ModuleList)
            and not (module == model)
            and module not in already_registered
        ):
            already_registered.add(module)
            hooks.append(module.register_forward_hook(hook))
    device = device.lower()
    assert device in [
        "cuda",
        "cpu",
    ], "Input device is not valid, please specify 'cuda' or 'cpu'"

    if device == "cuda" and torch.cuda.is_available():
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    # multiple inputs to the network
    if isinstance(input_size, tuple):
        input_size = [input_size]

    # batch_size of 2 for batchnorm
    x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
    # print(type(x[0]))

    # create properties
    summary = OrderedDict()
    hooks = []

    # register hook
    model.apply(register_hook)

    # make a forward pass
    # print(x.shape)
    model(*x)

    # remove these hooks
    for h in hooks:
        h.remove()
print("----------------------------------------------------------------")
line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
print(line_new)
print("================================================================")
total_params = 0
total_output = 0
trainable_params = 0
for layer in summary:
# input_shape, output_shape, trainable, nb_params
line_new = "{:>20} {:>25} {:>15}".format(
layer,
str(summary[layer]["output_shape"]),
"{0:,}".format(summary[layer]["nb_params"]),
)
total_params += summary[layer]["nb_params"]
total_output += np.prod(summary[layer]["output_shape"])
if "trainable" in summary[layer]:
if summary[layer]["trainable"] == True:
trainable_params += summary[layer]["nb_params"]
print(line_new)
# assume 4 bytes/number (float on cuda).
total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients
total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.))
total_size = total_params_size + total_output_size + total_input_size
print("================================================================")
print("Total params: {0:,}".format(total_params))
print("Trainable params: {0:,}".format(trainable_params))
print("Non-trainable params: {0:,}".format(total_params - trainable_params))
print("----------------------------------------------------------------")
print("Input size (MB): %0.2f" % total_input_size)
print("Forward/backward pass size (MB): %0.2f" % total_output_size)
print("Params size (MB): %0.2f" % total_params_size)
print("Estimated Total Size (MB): %0.2f" % total_size)
print("----------------------------------------------------------------")
# return summary
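A short usage sketch of my own, assuming the original ResNetUNet from the question (duplicated children and all) is in scope: with the already_registered guard, the second hook is never attached, so each layer should show up once in the report:

model = ResNetUNet().cuda()    # the original class from the question, with self.base_model kept
summary(model, (3, 224, 224))  # patched summary above; expect Conv2d-1, BatchNorm2d-2, ReLU-3 only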
Note that I had a more detailed answer, which you can find in the edit history, but in hindsight I think this simpler explanation is easier to understand.