PyTorch Lightning trainer.fit 卡在 epoch 0

Luc*_*toe 5 python machine-learning conv-neural-network pytorch pytorch-lightning

我试图使用 PyTorch 和 PyTorch Lightning 制作多输入模型,但我不明白为什么训练器卡在 epoch 0。我试图将此代码从 TensorFlow 迁移到 PyTorch,但 PyTorch 学习曲线是有点陡,我不知道从这里该去哪里。

RC_train_config = config.init_dataset_config(
'RC',
'GI4E',
'label',
16,
lr = 0.001,
epochs = 500,
train_ratio = 0.8
Run Code Online (Sandbox Code Playgroud)

模型的配置,包括超参数和使用的数据集。它也用于数据选择,因为不同的数据集需要不同的处理方法。

class RCDataset(Dataset):
def __init__(self, config_dataset):
    super().__init__()
    self.config_dataset = config_dataset
    
    # Image-handling
    if self.config_dataset['dataset'] == 'all':
        pass
    elif self.config_dataset['dataset'] == 'BIOID':
        if self.config_dataset['mode'] == 'label':
            pass
        elif self.config_dataset['mode'] == 'filter':
            pass
    elif self.config_dataset['dataset'] == 'GI4E':
        if self.config_dataset['mode'] == 'label':
            image1_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/0/noteye/*')
            image1_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/0/left/*')
            image1_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/0/right/*')
            
            image2_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/1/noteye/*')
            image2_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/1/left/*')
            image2_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/1/right/*')
            
            image3_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/2/noteye/*')
            image3_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/2/left/*')
            image3_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/2/right/*')
            
            count_eye = len(glob(C.WORKING_DATASETS['GI4E']['images_label'] + '/2/left/*'))
        elif self.config_dataset['mode'] == 'filter':
            image1_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/0/noteye/*')
            image1_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/0/left/*')
            image1_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/0/right/*')
            
            image2_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/1/noteye/*')
            image2_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/1/left/*')
            image2_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/1/right/*')
            
            image3_noteye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/2/noteye/*')
            image3_eye_paths = glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/2/left/*')
            image3_eye_paths += glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/2/right/*')
            
            count_eye = len(glob(C.WORKING_DATASETS['GI4E']['images_filter'] + '/2/left/*'))
            
    self.image1_paths = image1_noteye_paths + image1_eye_paths
    self.image2_paths = image2_noteye_paths + image2_eye_paths
    self.image3_paths = image3_noteye_paths + image3_eye_paths
    
    # Label-handling
    label_noteye1 = torch.zeros(len(image1_noteye_paths))
    label_noteye2 = torch.zeros(len(image2_noteye_paths))
    label_noteye3 = torch.zeros(len(image3_noteye_paths))
    
    label_left = torch.ones(count_eye)
    label_right = torch.full([count_eye], 2)
    
    self.labels1 = torch.concat((label_noteye1, label_left, label_right))
    self.labels2 = torch.concat((label_noteye2, label_left, label_right))
    self.labels3 = torch.concat((label_noteye3, label_left, label_right))
           
def __getitem__(self, idx):
    image1 = None
    image2 = None
    image3 = None
    
    with Image.open(self.image1_paths[idx]) as img:
        image1(img)
        
    with Image.open(self.image2_paths[idx]) as img:
        image2(img)
        
    with Image.open(self.image3_paths[idx]) as img:
        image3(img)
        
    image1 = transforms.ToTensor()
    image2 = transforms.ToTensor()
    image3 = transforms.ToTensor()
            
    return image1, image2, image3, self.labels1[idx], self.labels2[idx], self.labels3[idx]

def __len__(self):
    return len(self.image1_paths)
Run Code Online (Sandbox Code Playgroud)

PyTorch 基础数据集

class RCDataModule(pl.LightningDataModule):
def __init__(self, config_dataset: dict, mode: str):
    super().__init__()
    dataset = RCDataset(config_dataset)
    
    self.config_dataset = config_dataset
    
    self.data_train = []
    self.data_val = []
    self.data_test = []
    
    if mode == 'train':
        self.data_train, self.data_val = random_split(
            dataset,
            [round(len(dataset) * config_dataset['train_ratio']),
             round(len(dataset) * (config_dataset['testval_ratio'] * 2))]
        )
    elif mode == 'predict':
        self.data_test = dataset
    
def train_dataloader(self):
    return DataLoader(
        self.data_train,
        batch_size=self.config_dataset['batch_size'],
        num_workers=12
    )

def val_dataloader(self):
    return DataLoader(
        self.data_val,
        batch_size=self.config_dataset['batch_size'],
        num_workers=12
    )

def predict_dataloader(self):
    return DataLoader(
        self.data_test,
        batch_size=self.config_dataset['batch_size'],
        num_workers=12
    )
Run Code Online (Sandbox Code Playgroud)

闪电包装纸

class RCBase(nn.Module):
def __init__(self):
    super().__init__()
    #RC1
    self.RC1 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=5, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 3),
        nn.Softmax(1)
    )
    
    #RC2
    self.RC2 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=5, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 3),
        nn.Softmax(1)
    )
    
    #RC3
    self.RC3 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=5, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Flatten(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 3),
        nn.Softmax(1)
    )
    
def forward(self, img1, img2, img3):
    output1 = self.RC1(img1)
    output2 = self.RC2(img2)
    output3 = self.RC3(img3)
    
    return output1, output2, output
Run Code Online (Sandbox Code Playgroud)

基础 PyTorch 模型

class RCPL(pl.LightningModule):
def __init__(self, config_dataset: dict):
    super().__init__()
    self.RC_base = RCBase()
    self.config_dataset = config_dataset

def forward(self, img1, img2, img3):
    output1, output2, output3 = self.RC_base(img1, img2, img3)
    
    return output1, output2, output3

def configure_optimizers(self):
    return optim.Adam(self.parameters(), lr=self.config_dataset['lr'])
    
def training_step(self, batch, batch_idx):
    img1, img2, img3, lbl1, lbl2, lbl3 = batch
    output1, output2, output3 = self.RC_base(img1, img2, img3)
    
    train_loss1 = F.cross_entropy(output1, lbl1)
    train_loss2 = F.cross_entropy(output2, lbl2)
    train_loss3 = F.cross_entropy(output3, lbl3)
    
    avg_loss = (train_loss1 + train_loss2 + train_loss3) / 3
    
    self.log('avg_train_loss', avg_loss, on_epoch = True)
    self.log('train_loss1', train_loss1, on_epoch = True)
    self.log('train_loss2', train_loss2, on_epoch = True)
    self.log('train_loss3', train_loss3, on_epoch = True)
    
    return avg_loss, train_loss1, train_loss2, train_loss3

def validation_step(self, batch, batch_idx):
    img1, img2, img3, lbl1, lbl2, lbl3 = batch
    output1, output2, output3 = self.RC_base(img1, img2, img3)
    
    val_loss1 = F.cross_entropy(output1, lbl1)
    val_loss2 = F.cross_entropy(output2, lbl2)
    val_loss3 = F.cross_entropy(output3, lbl3)
    
    avg_val_loss = (val_loss1 + val_loss2 + val_loss3) / 3
    
    self.log('avg_val_loss', avg_val_loss, on_epoch = True)
    self.log('val_loss1', val_loss1, on_epoch = True)
    self.log('val_loss2', val_loss2, on_epoch = True)
    self.log('val_loss3', val_loss3, on_epoch = True)

def predict_step(self, batch, batch_idx):
    img1, img2, img3, lbl1, lbl2, lbl3 = batch
    
    return self(img1, img2, img3)
Run Code Online (Sandbox Code Playgroud)

PyTorch 模型包装器。

从 TensorFlow 到 PyTorch 有点困难,因为在 TensorFlow 中使用一切都更加自动化和直观。这里没有产生错误输出,所以我不确定这有什么问题。

输出(为了便于阅读而简化):

LOCAL_RANK:0 - CUDA_VISIBLE_DEVICES:[0]

  • 239 K 可训练参数
  • 0 不可训练参数
  • 239 K 总参数
  • 0.958 估计模型参数总大小 (MB)

纪元 0: 0%| | 0/782 [00:00<?, ?it/s]