BERT encoder produces the same output for all inputs during evaluation (PyTorch)

Tags: python nlp pre-trained-model pytorch bert-language-model

I don't understand why my BERT model returns the same output for every input during evaluation. The outputs during training look correct, since the values differ between examples, but during evaluation they are completely identical. (Screenshot: outputs during evaluation)

Here is my BERT model class:

import torch.nn as nn
from transformers import BertModel


class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 4)

    def forward(self, ids, mask, token_type_ids):
        # Tuple unpacking works with older transformers releases, where BertModel
        # returns (sequence_output, pooled_output); o2 is the pooled [CLS] vector
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        bo = self.bert_drop(o2)
        return self.out(bo)
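Note that the tuple unpacking in forward only works on older transformers releases. On transformers 4.x the model returns a model-output object by default, so the pooled vector has to be taken explicitly. A minimal sketch of the same forward pass written against that newer API (class name BERTBaseUncasedV4 is just illustrative), assuming transformers >= 4.0:

import torch.nn as nn
from transformers import BertModel


class BERTBaseUncasedV4(nn.Module):
    """Same classifier head, written against the transformers 4.x output API."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 4)

    def forward(self, ids, mask, token_type_ids):
        # pooler_output is the [CLS] hidden state passed through a dense + tanh layer
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        bo = self.bert_drop(outputs.pooler_output)
        return self.out(bo)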

My dataset class:

import torch
from tensorflow.keras.utils import to_categorical  # one-hot helper; the Keras utility is assumed here


class BERTDataset:
    def __init__(self, review, target, tokenizer, max_len, classes=4):
        self.review = review
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len  # must be passed in, not read from a global
        self.classes = classes

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        # Index the single review for this item; str(self.review) would stringify
        # the whole list and feed the same text to the model for every example
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(review, None, add_special_tokens=True, max_length=self.max_len,
                                            pad_to_max_length=True, return_token_type_ids=True,
                                            return_attention_mask=True)  # keyword is singular in transformers

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(to_categorical(self.target[item], self.classes), dtype=torch.float)
        }
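As a sanity check, here is a minimal sketch of wiring this dataset into a DataLoader; the toy reviews, targets, and MAX_LEN below are illustrative and not taken from the question:

from torch.utils.data import DataLoader
from transformers import BertTokenizer

# Toy data for illustration only; real reviews/targets come from your own dataset
reviews = ["great movie", "terrible plot", "just okay", "loved it"]
targets = [3, 0, 1, 3]
MAX_LEN = 128  # assumed value, not given in the question

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = BERTDataset(reviews, targets, tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

batch = next(iter(train_loader))
print(batch["ids"].shape, batch["targets"].shape)  # torch.Size([2, 128]) torch.Size([2, 4])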

My evaluation function:

def eval_fn(data_loader, model, device):
    model.eval()

    total_loss = 0.0

    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            targets = d['targets']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            loss = loss_fn(outputs, targets)
            total_loss += loss.item()

    # Return the mean loss so the caller can actually track validation loss
    return total_loss / len(data_loader)
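loss_fn is not defined anywhere in the question. Because the targets are one-hot encoded floats of shape (batch, 4), one definition that fits these shapes is BCEWithLogitsLoss; a sketch of that assumption:

import torch.nn as nn

def loss_fn(outputs, targets):
    # outputs: raw logits, shape (batch, 4); targets: one-hot floats, same shape
    return nn.BCEWithLogitsLoss()(outputs, targets)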

And my training function:

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()

    total_loss = 0.0

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d['ids']
        token_type_ids = d['token_type_ids']
        mask = d['mask']
        targets = d['targets']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, targets)
        total_loss += loss.item()
        loss.backward()

        optimizer.step()
        scheduler.step()

    return total_loss/len(data_loader)
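The optimizer and scheduler are also not shown in the question. A minimal sketch of a typical setup for this training loop, assuming AdamW from torch.optim and the linear warm-up schedule from transformers (the epoch count and learning rate are illustrative):

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTBaseUncased().to(device)

EPOCHS = 3  # assumed
num_train_steps = len(train_loader) * EPOCHS  # train_loader from the sketch above

optimizer = AdamW(model.parameters(), lr=3e-5)  # within the range recommended for BERT fine-tuning
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

for epoch in range(EPOCHS):
    train_loss = train_fn(train_loader, model, optimizer, device, scheduler)
    val_loss = eval_fn(train_loader, model, device)  # use a separate validation loader in practice
    print(f"epoch {epoch}: train={train_loss:.4f} val={val_loss:.4f}")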

Thanks!

Answer by 小智 (7 votes)

In case anyone else runs into this: perhaps you forgot to use one of the learning rates recommended in the official paper: 5e-5, 3e-5, or 2e-5.

If the learning rate is too high, e.g. 0.01, the gradients seem to become polarized, which leads to the same logits appearing over and over on the validation set.
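In other words, dropping the learning rate back into the recommended range is often enough to fix the collapsed logits; a sketch, assuming AdamW from torch.optim:

from torch.optim import AdamW

# Too high for BERT fine-tuning; can collapse the classifier into one output
# optimizer = AdamW(model.parameters(), lr=0.01)

# One of the rates recommended in the BERT paper
optimizer = AdamW(model.parameters(), lr=2e-5)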