War*_*ice · 11 · Tags: nvidia, machine-learning, deep-learning, conv-neural-network, pytorch
I'm trying to train a CNN in PyTorch, but I'm running into a problem, a runtime error:

RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 2.00 GiB total capacity; 584.97 MiB already allocated; 13.81 MiB free; 590.00 MiB reserved in total by PyTorch)

Here is my code:
import os
import numpy as np
import cv2
import torch as t
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader,Dataset
import time
import matplotlib.pyplot as plt
%matplotlib inline
root_path='C:/Users/60960/Desktop/recet-task/course_LeeML20/course_LeeML20-datasets/hw3/food-11'
training_path=root_path+'/training'
testing_path=root_path+'/testing'
validation_path=root_path+'/validation'
def readfile(path,has_label):
    # read every image under `path`, resize to 128x128;
    # labels (if any) are parsed from the filename prefix, e.g. "3_0012.jpg" -> class 3
    img_paths=sorted(os.listdir(path))
    x=np.zeros((len(img_paths),128,128,3),dtype=np.uint8)
    y=np.zeros((len(img_paths)),dtype=np.uint8)
    for i,file in enumerate(img_paths):
        img=cv2.imread(path+'/'+file)
        x[i,:,:]=cv2.resize(img,(128,128))
        if has_label:
            y[i]=int(file.split('_')[0])
    if has_label:
        return x,y
    else:
        return x
def show_img(img_from_cv2):
    b,g,r=cv2.split(img_from_cv2)
    img=cv2.merge([r,g,b])
    plt.imshow(img)
    plt.show()
x_train,y_train=readfile(training_path,True)
x_val,y_val=readfile(validation_path,True)
x_test=readfile(testing_path,False)
train_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])
test_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])
class ImgDataset(Dataset):
    def __init__(self,x,y=None,transform=None):
        self.x=x
        self.y=y
        if y is not None:
            self.y=t.LongTensor(y)
        self.transform=transform
    def __len__(self):
        return len(self.x)
    def __getitem__(self,idx):
        X=self.x[idx]
        if self.transform is not None:
            X=self.transform(X)
        if self.y is not None:
            Y=self.y[idx]
            return X,Y
        return X
batch_size=128
train_set=ImgDataset(x_train,y_train,transform=train_transform)
val_set=ImgDataset(x_val,y_val,transform=test_transform)
train_loader=DataLoader(train_set,batch_size=batch_size,shuffle=True)
val_loader=DataLoader(val_set,batch_size=batch_size,shuffle=False)
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier,self).__init__()
        # input is 3x128x128; each MaxPool2d halves the spatial size,
        # so after five pools the feature map is 512x4x4
        self.cnn=nn.Sequential(
            nn.Conv2d(3,64,3,1,1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2,2,0),    # 64x64x64
            nn.Conv2d(64,128,3,1,1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2,2,0),    # 128x32x32
            nn.Conv2d(128,256,3,1,1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2,2,0),    # 256x16x16
            nn.Conv2d(256,512,3,1,1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2,2,0),    # 512x8x8
            nn.Conv2d(512,512,3,1,1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2,2,0)     # 512x4x4
        )
        self.fc=nn.Sequential(
            nn.Linear(512*4*4,1024),
            nn.ReLU(),
            nn.Linear(1024,512),
            nn.ReLU(),
            nn.Linear(512,11)       # 11 food classes
        )
    def forward(self,x):
        out=self.cnn(x)
        out=out.view(out.size()[0],-1)
        return self.fc(out)
model=Classifier().cuda()
loss_fn=nn.CrossEntropyLoss()
optim=t.optim.Adam(model.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_start_time=time.time()
    train_acc=0.0
    train_loss=0.0
    val_acc=0.0
    val_loss=0.0
    model.train()
    for i,data in enumerate(train_loader):
        optim.zero_grad()
        train_pred=model(data[0].cuda())
        batch_loss=loss_fn(train_pred,data[1].cuda())
        batch_loss.backward()
        optim.step()
        train_acc+=np.sum(np.argmax(train_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
        train_loss+=batch_loss.item()
    model.eval()
    with t.no_grad():
        for i,data in enumerate(val_loader):
            val_pred=model(data[0].cuda())
            batch_loss=loss_fn(val_pred,data[1].cuda())
            val_acc+=np.sum(np.argmax(val_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
            val_loss+=batch_loss.item()
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % (epoch + 1, epochs, time.time()-epoch_start_time, train_acc/len(train_set), train_loss/len(train_set), val_acc/len(val_set), val_loss/len(val_set)))
x_train_val=np.concatenate((x_train,x_val),axis=0)
y_train_val=np.concatenate((y_train,y_val),axis=0)
# the labels (y_train_val) must be passed as the second argument here,
# not x_train_val a second time
train_val_set=ImgDataset(x_train_val,y_train_val,train_transform)
train_val_loader=DataLoader(train_val_set,batch_size=batch_size,shuffle=True)
model_final=Classifier().cuda()
loss_fn=nn.CrossEntropyLoss()  # nn.CrossEntropy does not exist
optim=t.optim.Adam(model_final.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_start_time=time.time()
    train_acc=0.0
    train_loss=0.0
    model_final.train()
    for i,data in enumerate(train_val_loader):
        optim.zero_grad()
        train_pred=model_final(data[0].cuda())
        batch_loss=loss_fn(train_pred,data[1].cuda())
        batch_loss.backward()
        optim.step()
        train_acc+=np.sum(np.argmax(train_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
        train_loss+=batch_loss.item()
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f' % (epoch + 1, epochs, time.time()-epoch_start_time, train_acc/len(train_val_set), train_loss/len(train_val_set)))
test_set=ImgDataset(x_test,transform=test_transform)
test_loader=DataLoader(test_set,batch_size=batch_size,shuffle=False)
model_final.eval()
prediction=[]
with t.no_grad():
    for i,data in enumerate(test_loader):
        test_pred=model_final(data.cuda())
        test_label=np.argmax(test_pred.cpu().data.numpy(),axis=1)
        for y in test_label:
            prediction.append(y)
with open('predict.csv','w') as f:
    f.write('Id,Category\n')
    for i,y in enumerate(prediction):
        f.write('{},{}\n'.format(i,y))  # the stray comma after '\n' was a bug
The PyTorch version is 1.4.0 and the OpenCV version is 4.2.0.
The training dataset consists of images like this: [sample images from the training set]
The error occurs on this line:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-1-770be67177f4> in <module>
119 for i,data in enumerate(train_loader):
120 optim.zero_grad()
--> 121 train_pred=model(data[0].cuda())
122 batch_loss=loss_fn(train_pred,data[1].cuda())
123 batch_loss.backward()
I have installed: [screenshot with environment details]
The GPU utilization is low, close to zero: [screenshot of GPU utilization]
The error message says:
RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB.
So I'd like to know how to allocate more memory.
What's more, I have tried reducing the batch size to 1, but this didn't work.
Help!!!
小智 · 5
Before reducing the batch size, check the state of your GPU memory:

nvidia-smi

Then check which process is occupying the memory, take its PID, and kill that process:

sudo kill -9 PID

or

sudo fuser -v /dev/nvidia*
sudo kill -9 PID
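If you prefer to check from inside Python, here is a minimal sketch (assuming a CUDA build of PyTorch 1.4 or newer, matching the asker's version; print_gpu_memory is a hypothetical helper name):

import torch

def print_gpu_memory(device=0):
    # memory this process has allocated vs. reserved on the GPU;
    # memory_reserved() was called memory_cached() before PyTorch 1.4
    mib = 1024 ** 2
    print('allocated: %.1f MiB' % (torch.cuda.memory_allocated(device) / mib))
    print('reserved:  %.1f MiB' % (torch.cuda.memory_reserved(device) / mib))

print_gpu_memory()

Note this only shows memory owned by the current process; memory held by other processes is only visible through nvidia-smi.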
小智 · 4
Try reducing your batch_size (e.g. to 32). This happens because your GPU memory cannot hold a full batch of images plus all the intermediate activations needed for backpropagation.
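For a sense of scale (a back-of-the-envelope sketch using the layer sizes from the question): with batch_size=128, the first conv layer alone outputs a float32 tensor of shape 128 x 64 x 128 x 128, which is 128 * 64 * 128 * 128 * 4 bytes = exactly 512 MiB, matching the "Tried to allocate 512.00 MiB" in the error. Halving the batch size halves every such activation. A minimal sketch of the change, reusing train_set and val_set from the question:

from torch.utils.data import DataLoader

batch_size = 32  # was 128; every activation tensor shrinks proportionally
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

If a crashed run is still holding memory, t.cuda.empty_cache() only releases PyTorch's own cache; memory owned by other processes has to be freed by killing them, as the answer above describes.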