Noonisy
S2-Transformer Code Walkthrough
2022-08-18

$S^2$-Transformer code walkthrough (based on $M^2$)


I changed the code to run on a single GPU, ignoring DDP training.

TODO

  • [✔] 2022-08-18
    • [✔] Started
  • [ ] 2022-09-12





Running main

In train_transformer.py, a number of optional command-line arguments are defined and stored in args:
device = torch.device('cuda')
parser = argparse.ArgumentParser(description="Transformer")
parser.add_argument('--exp_name', type=str, default='s2')
...
args = parser.parse_args()
Printing them:
print(args)
'''
annotation_folder='Data/annotations'
batch_size=50
dir_to_save_model='checkpoint/'
exp_name='s2'
features_path='Data/X101_grid_feats_coco_trainval.hdf5'
head=8
logs_folder='tensorboard_logs'
m=40  # unused
num_clusters=5
refine_epoch_rl=28
resume_best=False
resume_last=False
rl_base_lr=5e-06
text2text=0  # unused
warmup=10000
workers=0
xe_base_lr=0.0001
xe_least=15
xe_most=20
'''
They are passed into the train function:
train(args)
Preparation: check whether the model-saving directory dir_to_save_model and the TensorBoard log directory logs_folder exist, and create them if not:
# preparation
if not os.path.exists(args.dir_to_save_model):
    os.makedirs(args.dir_to_save_model)
if not os.path.exists(args.logs_folder):
    os.makedirs(args.logs_folder)
Create the TensorBoard writer to visualize the training process:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir=os.path.join(args.logs_folder, args.exp_name))
Create image_field, the class representing the image features:
# Pipeline for image regions
image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=49, load_in_tmp=False)
Create text_field, the class representing the text annotations:
# Pipeline for text
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True, nopoints=False)
Create the dataset; COCO is used here:
# Create the dataset
dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder)
train_dataset, val_dataset, test_dataset = dataset.splits
Create vocab.pkl, the vocabulary, containing 10201 tokens (words and symbols):
  • if it already exists, load it directly
  • otherwise, build the vocabulary from tokens appearing at least 5 times (min_freq=5)
if not os.path.isfile('vocab.pkl'):
    print("Building vocabulary")
    text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
    pickle.dump(text_field.vocab, open('vocab.pkl', 'wb'))
else:
    print('Loading from vocabulary')
    text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))
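As a sanity check, the vocabulary behaves like a standard torchtext vocab: stoi maps tokens to indices, with the special tokens first. A minimal probe (the concrete indices are an assumption based on torchtext's default special ordering, which matches the <pad>=1 used below):
print(len(text_field.vocab))           # 10201
print(text_field.vocab.stoi['<pad>'])  # 1, assuming torchtext's default ordering
print(text_field.vocab.stoi['<bos>'])  # the index later passed to Transformer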

Building the model

Creating the model

The encoder takes custom parameters:
  • 3 EncoderLayer layers
  • a padding index of 0 (a visual feature row summing to 0 is treated as padding)
  • the attention module attention_module is the original ScaledDotProductAttention
  • m=args.m=40 in attention_module_kwargs is not used in this paper; it is a hyperparameter from $M^2$
The decoder's parameters:
  • the length of the filtered vocabulary: 10201
  • the maximum sentence length: 54
  • 3 DecoderLayer layers
  • the index of the <pad> token: 1
The Transformer's parameters:
  • the index of <bos>
  • the encoder defined above
  • the decoder defined above
  • the number of clusters, a hyperparameter, 5 in the paper
  • the length of the vocab
  • max_len of 54
  • the index of <pad>
  • text_dimension, the projection dimension, 512
# Model and dataloaders
encoder = TransformerEncoder(3, 0, attention_module=ScaledDotProductAttention, attention_module_kwargs={'m': args.m})
decoder = TransformerDecoderLayer(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])
model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder, args.num_clusters, len(text_field.vocab), 54, text_field.vocab.stoi['<pad>'], 512).to(device)
Create the dict datasets, used during the SCST training stage:
dict_dataset_train = train_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
Create ref_caps_train, a list storing the training references (the labels): the sentences (str) for all images.
ref_caps_train = list(train_dataset.text())
Then class PTBTokenizer(object) in tokenizer.py processes ref_caps_train.

It uses the Stanford CoreNLP Java jar; the return value is a dict containing all the caps, where each cap is a list of strings.

This dict is passed to class Cider() to create the cider_train object, which is used for CIDEr-metric optimization during SCST:
cider_train = Cider(PTBTokenizer.tokenize(ref_caps_train))
Next come the learning-rate schedules for the two training stages, XE and SCST:
def lambda_lr(s):
    print("s:", s)
    if s <= 3:
        lr = args.xe_base_lr * s / 4
    elif s <= 10:
        lr = args.xe_base_lr
    elif s <= 12:
        lr = args.xe_base_lr * 0.2
    else:
        lr = args.xe_base_lr * 0.2 * 0.2
    return lr

def lambda_lr_rl(s):
    refine_epoch = args.refine_epoch_rl
    print("rl_s:", s)
    if s <= refine_epoch:
        lr = args.rl_base_lr
    elif s <= refine_epoch + 3:
        lr = args.rl_base_lr * 0.2
    elif s <= refine_epoch + 6:
        lr = args.rl_base_lr * 0.2 * 0.2
    else:
        lr = args.rl_base_lr * 0.2 * 0.2 * 0.2
    return lr
Initial conditions
optim = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
scheduler = LambdaLR(optim, lambda_lr)

optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
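Note that both Adam optimizers are created with lr=1: LambdaLR sets the learning rate to base_lr * lambda(epoch), so with a base of 1 the value returned by lambda_lr / lambda_lr_rl is the learning rate itself. A minimal check with a toy parameter (illustrative only):
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

p = torch.nn.Parameter(torch.zeros(1))
opt = Adam([p], lr=1, betas=(0.9, 0.98))  # base_lr = 1
sch = LambdaLR(opt, lambda_lr)            # reuses the schedule defined above
print(opt.param_groups[0]['lr'])          # lambda_lr(0) = 0.0 at epoch 0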
# the <pad> token is excluded from the loss
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])  
# the second loss
loss_align = MSELoss()  
loss = (loss_fn, loss_align)
use_rl = False
best_cider = .0
best_test_cider = 0.
patience = 0
start_epoch = 0
Set up resuming from a checkpoint:
if args.resume_last or args.resume_best:
    if args.resume_last:
        fname = os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name)
    else:
        fname = os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name)

    # fname = 'checkpoint/s2_last.pth'
    if os.path.exists(fname):
        print("load model {}".format(fname))
        data = torch.load(fname)
        torch.set_rng_state(data['torch_rng_state'])
        torch.cuda.set_rng_state(data['cuda_rng_state'])
        np.random.set_state(data['numpy_rng_state'])
        random.setstate(data['random_rng_state'])
        model.load_state_dict(data['state_dict'], strict=False)
        """
        optim.load_state_dict(data['optimizer'])
        scheduler.load_state_dict(data['scheduler'])
        """
        start_epoch = data['epoch'] + 1
        best_cider = data['best_cider']
        best_test_cider = data['best_test_cider']
        patience = data['patience']
        use_rl = data['use_rl']

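        # NOTE: given how the checkpoint is saved later ('optimizer' holds
        # optim_rl's state when use_rl is True), these two branches appear
        # swapped; they are shown here as they appear in the code.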
        if use_rl:
            optim.load_state_dict(data['optimizer'])
            scheduler.load_state_dict(data['scheduler'])
        else:
            optim_rl.load_state_dict(data['optimizer'])
            scheduler_rl.load_state_dict(data['scheduler'])

        print('Resuming from epoch %d, validation loss %f, best cider %f, and best_test_cider %f' % (data['epoch'], data['val_loss'], data['best_cider'], data['best_test_cider']))
        print('patience:', data['patience'])
    else:
        print("no load model")

Starting training

print("Training starts")
for e in range(start_epoch, start_epoch + 100):
    ...
Create the dataloaders used in the XE training stage:
dataloader_train = DataLoader(train_dataset, batch_size=args.batch_size, pin_memory=True, drop_last=False, num_workers=args.workers, shuffle=True, persistent_workers=True)
dataloader_val = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers)
Create the dict dataloaders used in the SCST training stage:
dict_dataloader_train = DataLoader(dict_dataset_train, batch_size=args.batch_size // 5,  pin_memory=True, drop_last=False, num_workers=args.workers, persistent_workers=True)
dict_dataloader_val = DataLoader(dict_dataset_val, batch_size=args.batch_size // 5)
dict_dataloader_test = DataLoader(dict_dataset_test, batch_size=args.batch_size // 5)
Check which training stage we are in (the two stages use different loss functions) and log each epoch:
if not use_rl:
    train_loss = train_xe(model, dataloader_train, optim, text_field, scheduler, loss_fn, e)
    writer.add_scalar('data/train_loss', train_loss, e)
else:
    train_loss, reward, reward_baseline = train_scst(model, dict_dataloader_train, optim_rl, cider_train, text_field, scheduler_rl, e)
    writer.add_scalar('data/train_loss', train_loss, e)
    writer.add_scalar('data/reward', reward, e)
    writer.add_scalar('data/reward_baseline', reward_baseline, e)
The XE training stage uses cross-entropy loss, i.e. NLLLoss applied to log-softmax outputs.

The model, dataloader, optimizer, etc. go into the training function, which returns the training loss after one epoch of updates.

First it iterates over the dataloader, yielding the iteration index, detections (the keys), and captions (the values); detections has shape (bs, max_detections, visual_dim).

detections (as images) and captions (as seq) are fed to the model; the gradients are zeroed; captions is sliced from the second token onward, i.e. captions[:, 1:]. captions has shape (batch_size, seq_len), and .contiguous() ensures the sliced data occupies contiguous memory (otherwise a new block is allocated for it). The slice is named captions_gt, i.e. the ground truth, with shape (batch_size, seq_len-1). Since the captions are to be predicted, the alignment is shifted by one: within a sequence, the previous token predicts the next one.

out is the model's output, with shape (batch_size, seq_len, vocab_len). The last token along seq_len is then cut off so that out aligns with captions_gt for the loss, leaving shape (batch_size, seq_len-1, vocab_len).

out is reshaped to (batch_size * (seq_len-1), vocab_len) and captions_gt to (batch_size * (seq_len-1)); both go into the loss function, here NLLLoss(), and the loss is backpropagated to update the parameters:
  • out = (bs * (seq_len-1), vocab_len)
  • captions_gt = (bs * (seq_len-1))
captions_gt serves as the labels and is a 1-D tensor; out is the last decoder layer's output after log_softmax: softmax maps values to positive numbers between 0 and 1, and taking the log maps them to negative numbers in $(-\infty, 0)$. (A toy version of this shift follows the train_xe code below.)

pbar refreshes the progress bar after each iteration:
def train_xe(model, dataloader, optim, text_field,  scheduler, loss_fn, e):
    # Training with cross-entropy
    model.train()
    scheduler.step()
    # show learning rate
    print('lr = ', optim.state_dict()['param_groups'][0]['lr'])  
    running_loss = .0
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader)) as pbar:
        for it, (detections, captions) in enumerate(dataloader):
            detections, captions = detections.to(device), captions.to(device)
            out = model(mode='xe', images=detections, seq=captions)
            optim.zero_grad()
            captions_gt = captions[:, 1:].contiguous()
            out = out[:, :-1].contiguous()

            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions_gt.view(-1))
            loss.backward()
            optim.step()
            
            this_loss = loss.item()
            running_loss += this_loss

            pbar.set_postfix(loss=running_loss / (it + 1))
            pbar.update()

    loss = running_loss / len(dataloader)
    return loss
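A toy check of the one-token shift described above (the indices are made up for illustration; assume <pad>=1):
import torch
from torch.nn import NLLLoss

captions = torch.tensor([[2, 7, 9, 3, 1]])          # (bs=1, seq_len=5), ends in <pad>
captions_gt = captions[:, 1:].contiguous()          # targets, shape (1, 4)
vocab_len = 10
out = torch.randn(1, 5, vocab_len).log_softmax(-1)  # stand-in for the model output
out = out[:, :-1].contiguous()                      # (1, 4, vocab_len)
# position t of out is trained to predict captions[:, t+1]
loss = NLLLoss(ignore_index=1)(out.view(-1, vocab_len), captions_gt.view(-1))
print(loss.item())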

The XE training stage

For XE (cross-entropy) training, the images are fed to the encoder (TransformerEncoder). First super().__init__ initializes the various attributes, e.g. self.d_model and self.SR, then execution enters the forward() function.

The steps of this stage can be divided into three parts (each covered below):
  • Encoding the grid features
  • Encoding the augmented grid features
  • Decoding the combined features
if mode == 'xe':
    # images = (batch_size, max_detections, dim_visual)
    bs, _, vis_dim = images.size()
    # Grid features
    grid_enc_output, grid_mask_enc = self.encoder(images)

    # Pseudo-region features
    # (N, num_clusters*2048) -> (N, num_clusters, 2048)
    pseudo_region = self.SP(images).view(bs, -1, vis_dim)  
    pseudo_region_enc_output, pseudo_region_mask_enc = self.encoder(pseudo_region)

    output, mask = torch.cat([grid_enc_output, pseudo_region_enc_output], dim=1), torch.cat([grid_mask_enc, pseudo_region_mask_enc], dim=-1)
    # decode the combined features
    dec_output = self.decoder(seq, output, mask)

    return dec_output
Encoding the grid features
TransformerEncoder.forward() processes the input (the images) with masking, an fc layer, dropout, etc., then calls the parent class MultiLevelEncoder.forward() via super().forward():
def forward(self, input, attention_weights=None):
    mask = (torch.sum(input, dim=-1) == 0).unsqueeze(-1)
    out = F.relu(self.fc(input))
    out = self.dropout(out)
    out = self.layer_norm(out)
    out = out.masked_fill(mask, 0)
    # out (bs, max_detections, d_model)
    return super(TransformerEncoder, self).forward(out, attention_weights=attention_weights)
Below is the parent class MultiLevelEncoder's forward() method: it computes attention_mask and feeds the input to the Scale-aware Reinforcement (SR) module:
def forward(self, input, attention_weights=None):
    # input = (bs, max_detections, d_model)
    # attention_mask = (bs, 1, 1, max_detections)
    attention_mask = (torch.sum(input, -1) == self.padding_idx).unsqueeze(1).unsqueeze(1)  
    out = self.SR(input, self.layers, attention_mask, attention_weights)
    return out, attention_mask
The SR module's forward() then processes the input, using attention to extract features and their semantic information from x, and encodes them.

SR: weighted fusion of features
def forward(self, x, layers, attention_mask = None, attention_weights = None):
    out = x
    outs = []
    for l in layers:
        out = l(out, out, out, attention_mask, attention_weights)
        outs.append(out)
    outs = self.MLP(torch.cat(outs, -1))
    out = 0.2 * outs + out
    return out
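A minimal sketch of this fusion, assuming the MLP simply maps the concatenated per-layer outputs (3 * d_model) back to d_model; the repo's exact MLP definition may differ:
import torch
import torch.nn as nn

d_model, bs, n = 512, 2, 49
layer_outs = [torch.randn(bs, n, d_model) for _ in range(3)]  # one output per encoder layer
mlp = nn.Linear(3 * d_model, d_model)                         # assumed fusion MLP
fused = mlp(torch.cat(layer_outs, -1))                        # (bs, n, d_model)
out = 0.2 * fused + layer_outs[-1]                            # weighted residual add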
out, attention_mask, etc. pass through the encoder layers, 3 in total, each with identical input and output shapes; execution enters EncoderLayer.forward().

Following the paper Attention Is All You Need, this is the forward pass of an encoder layer:
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
    att = self.mhatt(queries, keys, values, attention_mask, attention_weights)
    att = self.lnorm(queries + self.dropout(att))
    ff = self.pwff(att)
    return ff
Each of the layers has the following structure:
(0): EncoderLayer(
    (mhatt): MultiHeadAttention(
        (attention): ScaledDotProductAttention(
            (fc_q): Linear(in_features=512, out_features=512, bias=True)
            (fc_k): Linear(in_features=512, out_features=512, bias=True)
            (fc_v): Linear(in_features=512, out_features=512, bias=True)
            (fc_o): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (lnorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (pwff): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (dropout_2): Dropout(p=0.1, inplace=False)
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
)
The outputs of all encoder layers are concatenated, and the MLP weighs each layer's contributions to produce outs, which carries more low-level semantic information; scaled by a weight, it is added onto the original features, giving the reinforced grid encoding:
  • the output out = (bs, max_detections, d_model)
  • attention_mask = (bs, 1, 1, max_detections)
self.encoder finally returns out and attention_mask, named grid_enc_output and grid_mask_enc.

Encoding the augmented grid features
Next, the original images (the grid features) are fed to the Spatial-aware Pseudo-supervised (SP) module, which clusters them into the authors' Pseudo-region Features.

Execution first enters the SP module's forward() method, which reads the grid features' batch_size and visual_dim (i.e. C_in). The original grid features grid have shape (batch_size, max_detection_region=49, visual_dim) and are reshaped to (batch_size, visual_dim, 7, 7).

The features grid are l2-normalized, then convolved with Conv2d:
  • (bs, C_in=visual_dim=2048, h=7, w=7) -> (bs, C_out=num_clusters=5, h=7, w=7)
The shape is then transformed again:
  • (bs, num_clusters=5, h=7, w=7) -> (bs, num_clusters, 49)
A Softmax then produces soft_assign.

x_flatten=(bs, visual_dim, -1)=(bs, visual_dim, 49) is expanded with expand(num_clusters, -1, -1, -1), adding a leading dimension of size num_clusters:
  • (bs, visual_dim, 49) -> (num_clusters, bs, visual_dim, 49)
then rearranged with permute(1, 0, 2, 3), giving $x\_flatten$:
  • (num_clusters, bs, visual_dim, 49) -> (bs, num_clusters, visual_dim, 49)
centroids=(num_clusters, visual_dim) is likewise expanded with expand(49, -1, -1):
  • (num_clusters, visual_dim) -> (49, num_clusters, visual_dim)
then permute(1, 2, 0):
  • (49, num_clusters, visual_dim) -> (num_clusters, visual_dim, 49)
and finally unsqueeze(0), giving $centroids$:
  • (num_clusters, visual_dim, 49) -> (1, num_clusters, visual_dim, 49)
Subtracting $centroids$ from $x\_flatten$ gives residual=(bs, num_clusters, visual_dim, 49).

residual is multiplied by soft_assign.unsqueeze(2), giving the new residual.

Summing over the last dimension gives p=(bs, num_clusters, visual_dim).

An intra-normalization (normalization within each cluster) follows, then a reshape view(bs, -1):
  • (bs, num_clusters, visual_dim) -> (bs, num_clusters * visual_dim)
Finally an l2 normalization, and the Pseudo Region Features are returned.

(Formula figures omitted: soft_assign and the pseudo-region features.)
def forward(self, grids):
    N, C = grids.shape[0], grids.shape[-1]
    grids = grids.view(N, 7, 7, -1).permute(0,3,1,2).contiguous()
    if self.normalize_input:
        # across descriptor dim
        grids = F.normalize(grids, p=2, dim=1)  
    soft_assign = self.conv(grids).view(N, self.num_regions, -1)
    soft_assign = F.softmax(soft_assign, dim=1)

    x_flatten = grids.view(N, C, -1)
    residual = x_flatten.expand(self.num_regions, -1, -1, -1).permute(1, 0, 2, 3).contiguous() - self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).contiguous().unsqueeze(0)
    residual *= soft_assign.unsqueeze(2)
    
    p = residual.sum(dim=-1)
    # intra-normalization
    p = F.normalize(p, p=2, dim=2)  
    p = p.view(grids.size(0), -1)
    # L2 normalize
    p = F.normalize(p, p=2, dim=1)  

    return p
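A quick shape trace of this forward with the dimensions used in this post (bs=2, 49 grid cells, visual_dim=2048, num_clusters=5); random tensors stand in for the real inputs and parameters:
import torch
import torch.nn.functional as F

N, C, K = 2, 2048, 5
grids = F.normalize(torch.randn(N, C, 7, 7), p=2, dim=1)
soft_assign = torch.randn(N, K, 49).softmax(dim=1)  # stands in for self.conv + softmax
x_flatten = grids.view(N, C, -1)
centroids = torch.randn(K, C)
residual = x_flatten.expand(K, -1, -1, -1).permute(1, 0, 2, 3) \
         - centroids.expand(49, -1, -1).permute(1, 2, 0).unsqueeze(0)
print(residual.shape)                               # (2, 5, 2048, 49)
p = (residual * soft_assign.unsqueeze(2)).sum(-1)
print(p.shape)                                      # (2, 5, 2048)
print(p.view(N, -1).shape)                          # (2, 10240)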
With the new Pseudo Region Features in hand, they are fed to the encoder exactly like the original grid features, producing pseudo_region_enc_output and pseudo_region_mask_enc:
  • pseudo_region_enc_output = (bs, num_clusters, d_model)
  • pseudo_region_mask_enc = (bs, 1, 1, num_clusters)
Then grid_enc_output and pseudo_region_enc_output are concatenated along the first dimension (dim=1), giving the encoder's output:
  • (bs, max_detections, d_model) + (bs, num_clusters, d_model) = (bs, max_detections+num_clusters, d_model)
Likewise, grid_mask_enc and pseudo_region_mask_enc are concatenated along the last dimension (dim=-1), giving the mask (both concatenations are sketched in the snippet below):
  • (bs, 1, 1, max_detections) + (bs, 1, 1, num_clusters) = (bs, 1, 1, max_detections+num_clusters)
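In code, the two concatenations amount to the following (random stand-in tensors with the shapes above):
import torch

grid_enc_output = torch.randn(2, 49, 512)            # (bs, max_detections, d_model)
pseudo_region_enc_output = torch.randn(2, 5, 512)    # (bs, num_clusters, d_model)
output = torch.cat([grid_enc_output, pseudo_region_enc_output], dim=1)
print(output.shape)                                  # (2, 54, 512)

grid_mask_enc = torch.zeros(2, 1, 1, 49, dtype=torch.bool)
pseudo_region_mask_enc = torch.zeros(2, 1, 1, 5, dtype=torch.bool)
mask = torch.cat([grid_mask_enc, pseudo_region_mask_enc], dim=-1)
print(mask.shape)                                    # (2, 1, 1, 54)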
Decoding the combined features
Next, seq (the caption), output, and mask go into the decoder, calling TransformerDecoderLayer.forward().

input has shape (bs, seq_len); encoder_output has shape (bs, max_detections+num_clusters, d_model); mask_encoder has shape (bs, 1, 1, max_detections+num_clusters).

mask_queries is the padding mask of the input seq: positions that are not pad hold True, the others False
  • shape (bs, seq_len, 1)
mask_self_attention is a (seq_len, seq_len) upper-triangular matrix whose entries above the main diagonal (from diagonal=1 up) are all 1 and the rest 0, followed by two unsqueeze(0) calls:
  • (seq_len, seq_len) -> (1, 1, seq_len, seq_len)
mask_self_attention is then added to a (bs, 1, 1, seq_len) tensor with rows like [0,0,...,1,1,...,1], where a 1 marks padding:
  • (1, 1, seq_len, seq_len) + (bs, 1, 1, seq_len) = (bs, 1, seq_len, seq_len)
So the new mask_self_attention is a 4-D tensor with values 0, 1, and 2: 0 marks tokens that can be attended to (visible, or already predicted), while 1 and 2 mark blocked positions (a future token, padding, or both at once).

The new mask_self_attention then goes through .gt(0): entries strictly greater than 0 become True, all others False.

Next comes the check if self._is_stateful, which is only True during beam search.

When TransformerDecoderLayer is defined, two variables are initialized and registered as states: they live with the model in memory but are not updated by gradients, so they need not be recomputed every time, saving computation:
self.register_state('running_mask_self_attention', torch.zeros((1, 1, 0)).byte())
self.register_state('running_seq', torch.zeros((1,)).long())
Here, if self._is_stateful=True, running_mask_self_attention is fetched from memory and concatenated with the current mask_self_attention along the last dimension (dim=-1), producing the new mask_self_attention.

seq is a tensor of positions 1 through seq_len (torch.arange(1, seq_len + 1)), expanded bs times to shape (bs, seq_len).

Wherever mask_queries holds 0, the corresponding positions of seq are also set to 0.

Next, the input goes through the word_emb embedding layer, projected to d_model dimensions; likewise, seq goes through the pos_emb position-embedding layer, also projected to d_model, and the two are added:
  • input and seq both go (bs, seq_len) -> (bs, seq_len, d_model)
  • out = word_emb(input) + pos_emb(seq)
def forward(self, input, encoder_output, mask_encoder):
    # input (bs, seq_len)
    b_s, seq_len = input.shape[:2]
    # (b_s, seq_len, 1)
    mask_queries = (input != self.padding_idx).unsqueeze(-1).float()  
    mask_self_attention = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8, device=input.device), diagonal=1)
    # (1, 1, seq_len, seq_len)
    mask_self_attention = mask_self_attention.unsqueeze(0).unsqueeze(0)  
    mask_self_attention = mask_self_attention + (input == self.padding_idx).unsqueeze(1).unsqueeze(1).byte()
    # (b_s, 1, seq_len, seq_len)
    mask_self_attention = mask_self_attention.gt(0)  
    if self._is_stateful:
        self.running_mask_self_attention = torch.cat([self.running_mask_self_attention.type_as(mask_self_attention), mask_self_attention], -1)
        mask_self_attention = self.running_mask_self_attention
    # (b_s, seq_len)        
    seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device)  
    seq = seq.masked_fill(mask_queries.squeeze(-1) == 0, 0)
        
    if self._is_stateful:
        self.running_seq.add_(1)
        seq = self.running_seq
    # embedding layers
    out = self.word_emb(input) + self.pos_emb(seq)
    
    for i, l in enumerate(self.layers):
        out = l(out, encoder_output, mask_queries, mask_self_attention, mask_encoder)

    # (bs, seq_len, d_model) @ (d_model, vocab_size) = (bs, seq_len, vocab_size)
    out = self.fc(out)
    return F.log_softmax(out, dim=-1)
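A worked example of the mask construction above, with bs=1, seq_len=4, and a hypothetical <pad>=1 in the last position:
import torch

inp = torch.tensor([[2, 7, 9, 1]])                 # last token is padding
causal = torch.triu(torch.ones(4, 4, dtype=torch.uint8), diagonal=1)[None, None]
pad = (inp == 1).unsqueeze(1).unsqueeze(1).byte()  # (1, 1, 1, 4)
mask = (causal + pad).gt(0)
print(mask[0, 0].int())
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 1]], dtype=torch.int32)  # 1 (True) means blocked: future token or padding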
Then out, encoder_output, mask_queries, etc. pass through the decoder layers, 3 in total, each with identical input and output shapes; execution enters DecoderLayer.forward().

Likewise, this is the Transformer decoder implementation, close to the original paper. Note that the can_be_stateful argument differs between self_att and enc_att:
  • self_att = MultiHeadAttention(..., can_be_stateful=True, ...)
  • enc_att = MultiHeadAttention(..., can_be_stateful=False, ...)
The former is True, the latter False; this flag controls whether the running keys/values are cached.
def forward(self, input, enc_output, mask_pad, mask_self_att, mask_enc_att):
    # MHA+AddNorm
    self_att = self.self_att(input, input, input, mask_self_att)
    self_att = self.lnorm1(input + self.dropout1(self_att))
    # (bs, seq_len, d_model) * (bs, seq_len, 1) = (bs, seq_len, d_model)
    self_att = self_att * mask_pad
    # MHA + AddNorm: cross-attention over the image features
    enc_att = self.enc_att(self_att, enc_output, enc_output, mask_enc_att)
    enc_att = self.lnorm2(self_att + self.dropout2(enc_att))
    enc_att = enc_att * mask_pad

    ff = self.pwff(enc_att)
    ff = ff * mask_pad
    return ff
Feeding input, mask_self_att, etc. into the self_att layer enters MultiHeadAttention.forward():
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
    if self.can_be_stateful and self._is_stateful:
        self.running_keys = torch.cat([self.running_keys, keys], 1)
        keys = self.running_keys

        self.running_values = torch.cat([self.running_values, values], 1)
        values = self.running_values

    if self.identity_map_reordering:
        q_norm = self.layer_norm(queries)
        k_norm = self.layer_norm(keys)
        v_norm = self.layer_norm(values)
        out = self.attention(q_norm, k_norm, v_norm, attention_mask, attention_weights)
        out = queries + self.dropout(torch.relu(out))
    else:
        out = self.attention(queries, keys, values, attention_mask, attention_weights)
        out = self.dropout(out)
        out = self.layer_norm(queries + out)
    return out
MultiHeadAttention()定义的时候,会进行判断
  • can_be_stateful=True,则将running_keysrunning_values存储下来,形状为(0, d_model)
self.can_be_stateful = can_be_stateful
if self.can_be_stateful:
    self.register_state('running_keys', torch.zeros((0, d_model)))
    self.register_state('running_values', torch.zeros((0, d_model)))
If self.identity_map_reordering=True, layer normalization is applied before attention rather than after; this is the pre-LN vs. post-LN distinction.

Finally, the decoder's output is returned; in train_xe() the loss against captions_gt is computed, backpropagated, and the parameters are updated.


The SCST training stage

The SCST stage uses the self-critical policy-gradient method from reinforcement learning.

The dataloader is the dict_dataloader_train defined earlier. Iterating over it yields the iteration index, detections, caps_gt, and captions:
  • detections: Tensor(bs, max_detections, visual_dim)
  • caps_gt: list(bs), containing batch_size lists, each holding the 5 caption: str references for one image
  • captions: Tensor(bs, seq_len), holding the indices of the captions
def train_scst(model, dataloader, optim, cider, text_field,  scheduler_rl, e):
    # Training with self-critical
    tokenizer_pool = multiprocessing.Pool()
    running_reward = .0
    running_reward_baseline = .0

    model.train()
    scheduler_rl.step()
    print('lr = ', optim.state_dict()['param_groups'][0]['lr'])

    running_loss = .0
    seq_len = 20
    beam_size = 5
    # kwargs = {
    #     'text_flag': args.text2text
    # }
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader)) as pbar:
        for it, (detections, caps_gt, captions) in enumerate(dataloader):
            detections = detections.to(device)
            # text = captions.to(device)
            # kwargs['text'] = text
            outs, log_probs = model(mode='rl', images=detections, max_len=seq_len, eos_idx=text_field.vocab.stoi['<eos>'], beam_size=beam_size, out_size=beam_size)
            optim.zero_grad()
            # Rewards
            caps_gen = text_field.decode(outs.view(-1, seq_len))
            caps_gt = list(itertools.chain(*([c, ] * beam_size for c in caps_gt)))
            caps_gen, caps_gt = tokenizer_pool.map(evaluation.PTBTokenizer.tokenize, [caps_gen, caps_gt])

            reward = cider.compute_score(caps_gt, caps_gen)[1].astype(np.float32)
            reward = torch.from_numpy(reward).to(device).view(detections.shape[0], beam_size)
            reward_baseline = torch.mean(reward, -1, keepdim=True)
            loss = -torch.mean(log_probs, -1) * (reward - reward_baseline)

            loss = loss.mean()
            loss.backward()
            optim.step()

            running_loss += loss.item()
            running_reward += reward.mean().item()
            running_reward_baseline += reward_baseline.mean().item()
            pbar.set_postfix(loss=running_loss / (it + 1), reward=running_reward / (it + 1), reward_baseline=running_reward_baseline / (it + 1))
            pbar.update()

    loss = running_loss / len(dataloader)
    reward = running_reward / len(dataloader)
    reward_baseline = running_reward_baseline / len(dataloader)

    return loss, reward, reward_baseline
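One detail worth unpacking is the caps_gt line: each image's reference captions are repeated beam_size times so that they line up with the beam_size sequences generated per image. A toy run with beam_size=2 and made-up captions:
import itertools

caps_gt = [['a dog runs', 'a puppy running'], ['a cat sleeps']]
beam_size = 2
print(list(itertools.chain(*([c, ] * beam_size for c in caps_gt))))
# [['a dog runs', 'a puppy running'], ['a dog runs', 'a puppy running'],
#  ['a cat sleeps'], ['a cat sleeps']]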
modedetectionsmax_seq_lenbeam_size等等喂给model(),进入到Transformer.forward()方法;判断是rl模式,初始化BeamSearch,进入apply方法
elif mode == 'rl':
    bs = BeamSearch(self, max_len, eos_idx, beam_size)
    return bs.apply(images, out_size, return_probs)
What follows is the beam search algorithm: at each step the top-k sequences are kept, and the sentence they compose at the end is the final output caption.

model() returns outs and log_probs; the former holds the token indices. The generated sequences are decoded and, together with caps_gt, used to compute reward and reward_baseline; the loss is then computed and backpropagated to update the parameters.
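The loss is the standard self-critical policy gradient with the mean reward over the beam as baseline. A toy version of the reward/advantage computation (random numbers stand in for the CIDEr scores; bs=2, beam_size=5, seq_len=20 as above):
import torch

bs, beam_size, seq_len = 2, 5, 20
reward = torch.rand(bs, beam_size)                      # CIDEr per generated caption
reward_baseline = torch.mean(reward, -1, keepdim=True)  # mean over the beam
log_probs = torch.randn(bs, beam_size, seq_len)         # stand-in for the model output
loss = (-torch.mean(log_probs, -1) * (reward - reward_baseline)).mean()
# captions scoring above the baseline get their log-probabilities reinforced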

After each epoch, the validation loss and the metrics are computed:
# Validation loss
val_loss = evaluate_loss(model, dataloader_val, loss, text_field, e)
writer.add_scalar('data/val_loss', val_loss, e)
# Validation scores
scores = evaluate_metrics(model, dict_dataloader_val, text_field, e)
val_cider = scores['CIDEr']
print("Validation scores", scores)
writer.add_scalar('data/val_cider', val_cider, e)
writer.add_scalar('data/val_bleu1', scores['BLEU'][0], e)
writer.add_scalar('data/val_bleu4', scores['BLEU'][3], e)
writer.add_scalar('data/val_meteor', scores['METEOR'], e)
writer.add_scalar('data/val_rouge', scores['ROUGE'], e)
And the test scores:
# Test scores
scores = evaluate_metrics(model, dict_dataloader_test, text_field, e)
test_cider = scores['CIDEr']
print("Test scores", scores)
writer.add_scalar('data/test_cider', test_cider, e)
writer.add_scalar('data/test_bleu1', scores['BLEU'][0], e)
writer.add_scalar('data/test_bleu4', scores['BLEU'][3], e)
writer.add_scalar('data/test_meteor', scores['METEOR'], e)
writer.add_scalar('data/test_rouge', scores['ROUGE'], e)
Preparing for the next epoch:
# Prepare for next epoch
best = False
if val_cider >= best_cider:
    best_cider = val_cider
    patience = 0
    best = True
else:
    patience += 1

best_test = False
if test_cider >= best_test_cider:
    best_test_cider = test_cider
    best_test = True

switch_to_rl = False
exit_train = False
Handle the end of the XE stage, the switch to SCST, and the termination of the whole run:
if patience == 5:
    # the XE stage trains for at least 15 epochs
    if e < args.xe_least:
        print('special treatment, e = {}'.format(e))
        use_rl = False
        switch_to_rl = False
        patience = 0
    elif not use_rl:
        use_rl = True
        switch_to_rl = True
        patience = 0
        optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
        scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
        for k in range(e-1):
            scheduler_rl.step()
        print("Switching to RL")
    else:
        print('patience reached.')
        exit_train = True

if e == args.xe_most:
    # the XE stage runs no more than 20 epochs
    if not use_rl:
        use_rl = True
        switch_to_rl = True
        patience = 0
        optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
        scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
        for k in range(e-1):
            scheduler_rl.step()
        print("Switching to RL")
When switching to SCST, restore the random states and the best model's parameters:
if switch_to_rl and not best:
    data = torch.load(os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name))
    torch.set_rng_state(data['torch_rng_state'])
    torch.cuda.set_rng_state(data['cuda_rng_state'])
    np.random.set_state(data['numpy_rng_state'])
    random.setstate(data['random_rng_state'])
    model.load_state_dict(data['state_dict'])
    print('Resuming from epoch %d, validation loss %f, best_cider %f, and best test_cider %f' % (data['epoch'], data['val_loss'], data['best_cider'], data['best_test_cider']))
Save this epoch's state:
torch.save({
    'torch_rng_state': torch.get_rng_state(),
    'cuda_rng_state': torch.cuda.get_rng_state(),
    'numpy_rng_state': np.random.get_state(),
    'random_rng_state': random.getstate(),
    'epoch': e,
    'val_loss': val_loss,
    'val_cider': val_cider,
    'state_dict': model.state_dict(),
    'optimizer': optim.state_dict() if not use_rl else optim_rl.state_dict(),
    'scheduler': scheduler.state_dict() if not use_rl else scheduler_rl.state_dict(),
    'patience': patience,
    'best_cider': best_cider,
    'best_test_cider': best_test_cider,
    'use_rl': use_rl,
}, os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name))
Save the models: both the best validation model and the best test model are kept:
if best:
    copyfile(os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name), os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name))
if best_test:
    copyfile(os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name), os.path.join(args.dir_to_save_model, '%s_best_test.pth' % args.exp_name))
When training ends, exit:
if exit_train:
    writer.close()
    break


Last edited: 2022-10-09 09:28