$S^2$-Transformer Code Walkthrough (based on $M^2$)
I changed the code to run on a single GPU and skip DDP training.
Running main
The optional arguments defined in train_transformer.py are stored in args:
device = torch.device('cuda')
parser = argparse.ArgumentParser(description="Transformer")
parser.add_argument('--exp_name', type=str, default='s2')
...
args = parser.parse_args()
Print them:
print(args)
'''
annotation_folder='Data/annotations'
batch_size=50
dir_to_save_model='checkpoint/'
exp_name='s2'
features_path='Data/X101_grid_feats_coco_trainval.hdf5'
head=8
logs_folder='tensorboard_logs'
m=40 # unused
num_clusters=5
refine_epoch_rl=28
resume_best=False
resume_last=False
rl_base_lr=5e-06
text2text=0 # unused
warmup=10000
workers=0
xe_base_lr=0.0001
xe_least=15
xe_most=20
'''
Pass args into the train function:
train(args)
Preparation: check whether the checkpoint directory dir_to_save_model and the tensorboard_logs directory exist; create them if they don't.
# preparation
if not os.path.exists(args.dir_to_save_model):
    os.makedirs(args.dir_to_save_model)
if not os.path.exists(args.logs_folder):
    os.makedirs(args.logs_folder)
Create the TensorBoard writer to visualize the training process:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir=os.path.join(args.logs_folder, args.exp_name))
Create image_field, the class representing the image features:
# Pipeline for image regions
image_field = ImageDetectionsField(detections_path=args.features_path, max_detections=49, load_in_tmp=False)
Create text_field, the class representing the text annotations:
# Pipeline for text
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True, nopoints=False)
Create the dataset; COCO is used here:
# Create the dataset
dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder, args.annotation_folder)
train_dataset, val_dataset, test_dataset = dataset.splits
Create vocab.pkl, the vocabulary, which contains 10201 tokens:
- if it already exists, load it directly
- otherwise, build the vocabulary from the words that appear at least 5 times (min_freq=5)
if not os.path.isfile('vocab.pkl'):
    print("Building vocabulary")
    text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
    pickle.dump(text_field.vocab, open('vocab.pkl', 'wb'))
else:
    print('Loading from vocabulary')
    text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))
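As a quick sanity check, the loaded vocabulary can be inspected in place. Note this is a hedged sketch: the itos ordering assumes torchtext's usual special-token layout; only len(vocab)=10201 and <pad>=1 are confirmed by this walkthrough.

print(len(text_field.vocab))             # 10201
print(text_field.vocab.stoi['<pad>'])    # 1
print(text_field.vocab.itos[:4])         # e.g. ['<unk>', '<pad>', '<bos>', '<eos>'] (assumed)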
Building the model
Create the model.

The encoder's custom parameters:
- 3 EncoderLayers
- the index of the <pad> token, which is 0
- the attention module attention_module is the original ScaledDotProductAttention
- m=args.m=40 in attention_module_kwargs is not used in this paper; it is a hyperparameter from $M^2$

The decoder's parameters:
- the size of the filtered vocabulary, 10201
- the maximum sentence length, 54
- 3 DecoderLayers
- the index of the <pad> token, which is 1

The Transformer's parameters:
- the index of <bos>
- the encoder defined above
- the decoder defined above
- the number of clusters, a hyperparameter, 5 in the paper
- the length of vocab
- max_len, which is 54
- the index of <pad>
- text_dimension, the projection dimension, 512
# Model and dataloaders
encoder = TransformerEncoder(3, 0, attention_module=ScaledDotProductAttention, attention_module_kwargs={'m': args.m})
decoder = TransformerDecoderLayer(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])
model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder, args.num_clusters, len(text_field.vocab), 54, text_field.vocab.stoi['<pad>'], 512).to(device)
Create the dict versions of the datasets, used in the SCST training stage:
dict_dataset_train = train_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField(), 'add_text': text_field})
Create ref_caps_train, a list storing the references (also called labels) used during training, i.e. all the sentences (str) corresponding to the images:
ref_caps_train = list(train_dataset.text())
Then class PTBTokenizer(object) in tokenizer.py processes ref_caps_train. It relies on the Stanford CoreNLP Java jar and returns a dict containing all the caps; each cap is a list holding a series of str. This dict is passed to class Cider() to create the cider_train object, which is used for CIDEr optimization during SCST:
cider_train = Cider(PTBTokenizer.tokenize(ref_caps_train))
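For intuition, the tokenizer's output is keyed per reference and each value is a list of tokenized strings; a rough sketch of the call above (example values invented):

ref_caps_train = list(train_dataset.text())        # flat list of reference strings
tokenized = PTBTokenizer.tokenize(ref_caps_train)  # dict: id -> list of tokenized refs
# e.g. {0: ['a man riding a horse'], 1: ['a person on a horse'], ...} (illustrative)
cider_train = Cider(tokenized)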
Next, the learning-rate schedules for the two training stages, XE and SCST, are defined:
def lambda_lr(s):
    print("s:", s)
    if s <= 3:
        lr = args.xe_base_lr * s / 4
    elif s <= 10:
        lr = args.xe_base_lr
    elif s <= 12:
        lr = args.xe_base_lr * 0.2
    else:
        lr = args.xe_base_lr * 0.2 * 0.2
    return lr

def lambda_lr_rl(s):
    refine_epoch = args.refine_epoch_rl
    print("rl_s:", s)
    if s <= refine_epoch:
        lr = args.rl_base_lr
    elif s <= refine_epoch + 3:
        lr = args.rl_base_lr * 0.2
    elif s <= refine_epoch + 6:
        lr = args.rl_base_lr * 0.2 * 0.2
    else:
        lr = args.rl_base_lr * 0.2 * 0.2 * 0.2
    return lr
Initial conditions:
optim = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
scheduler = LambdaLR(optim, lambda_lr)
optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
# the <pad> token does not contribute to the loss
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])
# the second loss
loss_align = MSELoss()
loss = (loss_fn, loss_align)
use_rl = False
best_cider = .0
best_test_cider = 0.
patience = 0
start_epoch = 0
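Because optim is constructed with lr=1, LambdaLR's multiplicative factor is the effective learning rate, i.e. exactly the value the lambda returns. A minimal self-contained sketch (not repo code; xe_base_lr hard-coded to the default 0.0001) that prints the XE schedule:

import torch
from torch.optim.lr_scheduler import LambdaLR

xe_base_lr = 1e-4  # default value of args.xe_base_lr

def lambda_lr(s):
    if s <= 3:
        return xe_base_lr * s / 4      # warmup during the first epochs
    elif s <= 10:
        return xe_base_lr
    elif s <= 12:
        return xe_base_lr * 0.2
    return xe_base_lr * 0.2 * 0.2

opt = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=1, betas=(0.9, 0.98))
sched = LambdaLR(opt, lambda_lr)
for e in range(1, 15):
    sched.step()  # mirrors the scheduler.step() at the top of train_xe
    print(e, opt.param_groups[0]['lr'])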
Resume training from a checkpoint:
if args.resume_last or args.resume_best:
    if args.resume_last:
        fname = os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name)
    else:
        fname = os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name)
    # fname = 'checkpoint/s2_last.pth'

    if os.path.exists(fname):
        print("load model {}".format(fname))
        data = torch.load(fname)
        torch.set_rng_state(data['torch_rng_state'])
        torch.cuda.set_rng_state(data['cuda_rng_state'])
        np.random.set_state(data['numpy_rng_state'])
        random.setstate(data['random_rng_state'])
        model.load_state_dict(data['state_dict'], strict=False)
        """
        optim.load_state_dict(data['optimizer'])
        scheduler.load_state_dict(data['scheduler'])
        """
        start_epoch = data['epoch'] + 1
        best_cider = data['best_cider']
        best_test_cider = data['best_test_cider']
        patience = data['patience']
        use_rl = data['use_rl']
        # the saved optimizer/scheduler state belongs to the stage recorded in use_rl
        if use_rl:
            optim_rl.load_state_dict(data['optimizer'])
            scheduler_rl.load_state_dict(data['scheduler'])
        else:
            optim.load_state_dict(data['optimizer'])
            scheduler.load_state_dict(data['scheduler'])
        print('Resuming from epoch %d, validation loss %f, best cider %f, and best_test_cider %f' % (data['epoch'], data['val_loss'], data['best_cider'], data['best_test_cider']))
        print('patience:', data['patience'])
    else:
        print("no load model")
Start training:
print("Training starts")
for e in range(start_epoch, start_epoch + 100):
    ...
Create the dataloaders used in the XE training stage (note that PyTorch requires num_workers > 0 when persistent_workers=True, so the default workers=0 must be raised for this to run):
dataloader_train = DataLoader(train_dataset, batch_size=args.batch_size, pin_memory=True, drop_last=False, num_workers=args.workers, shuffle=True, persistent_workers=True)
dataloader_val = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers)
Create the dict_dataloaders used in the SCST training stage:
dict_dataloader_train = DataLoader(dict_dataset_train, batch_size=args.batch_size // 5, pin_memory=True, drop_last=False, num_workers=args.workers, persistent_workers=True)
dict_dataloader_val = DataLoader(dict_dataset_val, batch_size=args.batch_size // 5)
dict_dataloader_test = DataLoader(dict_dataset_test, batch_size=args.batch_size // 5)
Check which training stage we are in; the two stages use different loss functions. Log the metrics of each epoch:
if not use_rl:
    train_loss = train_xe(model, dataloader_train, optim, text_field, scheduler, loss_fn, e)
    writer.add_scalar('data/train_loss', train_loss, e)
else:
    train_loss, reward, reward_baseline = train_scst(model, dict_dataloader_train, optim_rl, cider_train, text_field, scheduler_rl, e)
    writer.add_scalar('data/train_loss', train_loss, e)
    writer.add_scalar('data/reward', reward, e)
    writer.add_scalar('data/reward_baseline', reward_baseline, e)
In the XE training stage, the loss is Cross Entropy = NLLLoss + log_softmax. The model, dataloader, optim, etc. are passed into the training function, which returns the training loss after one epoch of updates.
First, iterate over the dataloader to obtain iteration, detections (the keys) and captions (the values); detections has shape (bs, max_detections, visual_dim). Feed detections as images and captions as sequences into the model, and zero the gradients. Slice captions from the second token onward, i.e. captions[:, 1:], where captions has shape (batch_size, seq_len); .contiguous() guarantees the slice occupies a contiguous block of memory, copying it to a new block if necessary. The slice is named captions_gt, i.e. the ground truth, of shape (batch_size, seq_len-1). Because the model predicts captions, the targets are offset by exactly one position: within the sequence, each token predicts the next one.

out is the model output, of shape (batch_size, seq_len, vocab_len); the last token along seq_len is dropped so that it lines up with captions_gt for the loss, leaving (batch_size, seq_len-1, vocab_len).

Reshape out to (batch_size * (seq_len-1), vocab_len) and captions_gt to (batch_size * (seq_len-1)), feed them into the loss function, here NLLLoss(), compute the loss, then backpropagate and update the parameters.

out = (bs * (seq_len-1), vocab_len)
captions_gt = (bs * (seq_len-1))

captions_gt serves as the labels and is a 1-D tensor; out is the output of the last decoder layer passed through log_softmax: softmax maps the logits to positive numbers in $0\sim1$, and taking the log turns them into negative numbers in $-\infty\sim0$.

pbar updates a progress bar after each step.
def train_xe(model, dataloader, optim, text_field, scheduler, loss_fn, e):
    # Training with cross-entropy
    model.train()
    scheduler.step()
    # show learning rate
    print('lr = ', optim.state_dict()['param_groups'][0]['lr'])
    running_loss = .0
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader)) as pbar:
        for it, (detections, captions) in enumerate(dataloader):
            detections, captions = detections.to(device), captions.to(device)
            out = model(mode='xe', images=detections, seq=captions)
            optim.zero_grad()
            captions_gt = captions[:, 1:].contiguous()
            out = out[:, :-1].contiguous()
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions_gt.view(-1))
            loss.backward()
            optim.step()
            this_loss = loss.item()
            running_loss += this_loss
            pbar.set_postfix(loss=running_loss / (it + 1))
            pbar.update()
    loss = running_loss / len(dataloader)
    return loss
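To see why the two slices line up, here is a toy example (invented shapes, not repo code) of the shift-by-one target construction and the NLLLoss call:

import torch
import torch.nn.functional as F

bs, seq_len, vocab_len = 2, 5, 10
captions = torch.randint(0, vocab_len, (bs, seq_len))             # e.g. <bos> w1 w2 w3 <eos>
out = F.log_softmax(torch.randn(bs, seq_len, vocab_len), dim=-1)  # stand-in model output

captions_gt = captions[:, 1:].contiguous()   # targets:          w1 w2 w3 <eos>
out = out[:, :-1].contiguous()               # predictions made at <bos> w1 w2 w3
loss = F.nll_loss(out.view(-1, vocab_len), captions_gt.view(-1))
print(loss)  # scalar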
XE training stage
For XE (cross-entropy) training, the images are fed to the encoder (TransformerEncoder). super().__init__ first initializes the attributes, e.g. self.d_model and self.SR, and execution then enters the forward() function.
The steps of this stage can be divided into three parts:
if mode == 'xe':
    # images = (batch_size, max_detections, dim_visual)
    bs, _, vis_dim = images.size()
    # Grid features: encode the grid features
    grid_enc_output, grid_mask_enc = self.encoder(images)
    # Pseudo-region features: encode the enhanced grid features
    # (N, num_clusters*2048) -> (N, num_clusters, 2048)
    pseudo_region = self.SP(images).view(bs, -1, vis_dim)
    pseudo_region_enc_output, pseudo_region_mask_enc = self.encoder(pseudo_region)
    output, mask = torch.cat([grid_enc_output, pseudo_region_enc_output], dim=1), torch.cat([grid_mask_enc, pseudo_region_mask_enc], dim=-1)
    # decode the combined features
    dec_output = self.decoder(seq, output, mask)
    return dec_output
Encoding the grid features
TransformerEncoder.forward() processes the input, i.e. images, applying the mask, fc, dropout, etc., and then calls the parent class's MultiLevelEncoder.forward() via super().forward():
def forward(self, input, attention_weights=None):
    mask = (torch.sum(input, dim=-1) == 0).unsqueeze(-1)
    out = F.relu(self.fc(input))
    out = self.dropout(out)
    out = self.layer_norm(out)
    out = out.masked_fill(mask, 0)
    # out (bs, max_detections, d_model)
    return super(TransformerEncoder, self).forward(out, attention_weights=attention_weights)
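The mask here flags feature vectors that are all zeros, i.e. padded detection slots; a tiny toy illustration:

import torch

inp = torch.zeros(1, 3, 4)        # (bs, max_detections, d)
inp[0, 0] = 1.0                   # only the first slot holds a real feature
mask = (torch.sum(inp, dim=-1) == 0).unsqueeze(-1)
print(mask.squeeze(-1))           # tensor([[False,  True,  True]])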
Below is the parent class MultiLevelEncoder's forward() method, which computes attention_mask and feeds everything to the Scale-aware Reinforcement (SR) module:
def forward(self, input, attention_weights=None):
    # input = (bs, max_detections, d_model)
    # attention_mask = (bs, 1, 1, max_detections)
    attention_mask = (torch.sum(input, -1) == self.padding_idx).unsqueeze(1).unsqueeze(1)
    out = self.SR(input, self.layers, attention_mask, attention_weights)
    return out, attention_mask
The SR module's forward() uses the attention mechanism to extract features, and thus semantic information, from the input x, and encodes them:

def forward(self, x, layers, attention_mask=None, attention_weights=None):
    out = x
    outs = []
    for l in layers:
        out = l(out, out, out, attention_mask, attention_weights)
        outs.append(out)
    outs = self.MLP(torch.cat(outs, -1))
    out = 0.2 * outs + out
    return out
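A minimal self-contained sketch of this aggregation (the MLP is assumed here to be a single Linear mapping 3*d_model back to d_model; check the repo for its actual definition):

import torch
import torch.nn as nn

d_model, n_layers = 512, 3
mlp = nn.Linear(d_model * n_layers, d_model)   # stand-in for self.MLP (assumption)

x = torch.randn(2, 49, d_model)
out, outs = x, []
for _ in range(n_layers):                      # stand-ins for the 3 encoder layers
    out = out + torch.randn_like(out) * 0.01
    outs.append(out)
fused = mlp(torch.cat(outs, -1))               # weigh each layer's contribution
out = 0.2 * fused + out                        # add low-level info to the top layer
print(out.shape)                               # torch.Size([2, 49, 512])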
out, attention_mask, etc. are passed through the encoder layers, 3 in total, each with identical input and output shapes; this enters EncoderLayer.forward(). Following the paper Attention Is All You Need, this is the forward pass of a standard encoder layer:
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
    att = self.mhatt(queries, keys, values, attention_mask, attention_weights)
    att = self.lnorm(queries + self.dropout(att))
    ff = self.pwff(att)
    return ff
Each of the layers has the following structure:
(0): EncoderLayer(
  (mhatt): MultiHeadAttention(
    (attention): ScaledDotProductAttention(
      (fc_q): Linear(in_features=512, out_features=512, bias=True)
      (fc_k): Linear(in_features=512, out_features=512, bias=True)
      (fc_v): Linear(in_features=512, out_features=512, bias=True)
      (fc_o): Linear(in_features=512, out_features=512, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (lnorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (pwff): PositionWiseFeedForward(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (dropout_2): Dropout(p=0.1, inplace=False)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)
The outputs of every encoder layer are concatenated, and an MLP weighs each layer's contribution to produce outs, which carries more low-level semantic information; it is scaled by a weight and added onto the original features, yielding the enhanced grid encoding.
- the output out = (bs, max_detections, d_model)
- attention_mask = (bs, 1, 1, max_detections)
Finally self.encoder returns out and attention_mask, named grid_enc_output and grid_mask_enc.

Encoding the enhanced grid features
Next, the original images are fed as grid features into the Spatial-aware Pseudo-supervised (SP) module, which clusters them to produce the authors' Pseudo-region Features.
First enter the SP module's forward() method and read the grid features' batch_size and visual_dim (i.e. C_in). The original grid features grid have shape (batch_size, max_detections=49, visual_dim) and are reshaped to (batch_size, visual_dim, 7, 7).

grid is l2-normalized and then convolved:
Conv2d: (bs, C_in=visual_dim=2048, h=7, w=7) -> (bs, C_out=num_clusters=5, h=7, w=7)
followed by another reshape:
(bs, num_clusters=5, h=7, w=7) -> (bs, num_clusters, 49)
and a Softmax, giving soft_assign.

Let x_flatten = (bs, visual_dim, -1) = (bs, visual_dim, 49); expand(num_clusters, -1, -1, -1) inserts a leading dimension of size num_clusters:
(bs, visual_dim, 49) -> (num_clusters, bs, visual_dim, 49)
then permute(1, 0, 2, 3) rearranges it, giving x_flatten:
(num_clusters, bs, visual_dim, 49) -> (bs, num_clusters, visual_dim, 49)

Let centroids = (num_clusters, visual_dim); likewise expand(49, -1, -1):
(num_clusters, visual_dim) -> (49, num_clusters, visual_dim)
then permute(1, 2, 0):
(49, num_clusters, visual_dim) -> (num_clusters, visual_dim, 49)
and finally unsqueeze(0), giving centroids:
(num_clusters, visual_dim, 49) -> (1, num_clusters, visual_dim, 49)

Subtracting centroids from x_flatten gives residual = (bs, num_clusters, visual_dim, 49). residual is multiplied by soft_assign.unsqueeze(2) to give the new residual, and summing over the last dimension gives p = (bs, num_clusters, visual_dim).

After intra-normalization (normalization within each cluster), reshape with view(bs, -1):
(bs, num_clusters, visual_dim) -> (bs, num_clusters * visual_dim)
and a final l2 normalization returns the Pseudo Region Features.

def forward(self, grids):
    N, C = grids.shape[0], grids.shape[-1]
    grids = grids.view(N, 7, 7, -1).permute(0, 3, 1, 2).contiguous()
    if self.normalize_input:
        # across descriptor dim
        grids = F.normalize(grids, p=2, dim=1)
    soft_assign = self.conv(grids).view(N, self.num_regions, -1)
    soft_assign = F.softmax(soft_assign, dim=1)
    x_flatten = grids.view(N, C, -1)
    residual = x_flatten.expand(self.num_regions, -1, -1, -1).permute(1, 0, 2, 3).contiguous() - \
        self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).contiguous().unsqueeze(0)
    residual *= soft_assign.unsqueeze(2)
    p = residual.sum(dim=-1)
    # intra-normalization
    p = F.normalize(p, p=2, dim=2)
    p = p.view(grids.size(0), -1)
    # L2 normalize
    p = F.normalize(p, p=2, dim=1)
    return p
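To verify the shape bookkeeping above, here is a standalone re-derivation of the SP math with toy tensors (the conv weights and centroids are random stand-ins for self.conv and self.centroids):

import torch
import torch.nn.functional as F

bs, vis_dim, K = 2, 2048, 5                     # K = num_clusters
grids = torch.randn(bs, 49, vis_dim)
conv_w = torch.randn(K, vis_dim, 1, 1)          # stand-in for self.conv (1x1 conv)
centroids = torch.randn(K, vis_dim)             # stand-in for self.centroids

g = grids.view(bs, 7, 7, -1).permute(0, 3, 1, 2)                     # (bs, 2048, 7, 7)
g = F.normalize(g, p=2, dim=1)
soft_assign = F.softmax(F.conv2d(g, conv_w).view(bs, K, -1), dim=1)  # (bs, 5, 49)

x_flat = g.reshape(bs, vis_dim, -1)                                  # (bs, 2048, 49)
residual = x_flat.expand(K, -1, -1, -1).permute(1, 0, 2, 3) \
    - centroids.expand(49, -1, -1).permute(1, 2, 0).unsqueeze(0)     # (bs, 5, 2048, 49)
p = (residual * soft_assign.unsqueeze(2)).sum(-1)                    # (bs, 5, 2048)
p = F.normalize(p, p=2, dim=2).reshape(bs, -1)                       # (bs, 5*2048)
p = F.normalize(p, p=2, dim=1)
print(p.shape)                                                       # torch.Size([2, 10240])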
With the new Pseudo Region Features in hand, they pass through the encoder exactly like the original grid features, producing pseudo_region_enc_output and pseudo_region_mask_enc:
pseudo_region_enc_output = (bs, num_clusters, d_model)
pseudo_region_mask_enc = (bs, 1, 1, num_clusters)
Then grid_enc_output and pseudo_region_enc_output are concatenated (dim=1) along the first dimension to form the encoder's output:
(bs, max_detections, d_model) + (bs, num_clusters, d_model) = (bs, max_detections+num_clusters, d_model)
Likewise, grid_mask_enc and pseudo_region_mask_enc are concatenated along the last dimension (dim=-1) to form the mask:
(bs, 1, 1, max_detections) + (bs, 1, 1, num_clusters) = (bs, 1, 1, max_detections+num_clusters)
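A quick shape check of the fusion with toy tensors (shapes as traced above):

import torch

bs, d_model = 2, 512
grid_enc_output = torch.randn(bs, 49, d_model)
pseudo_region_enc_output = torch.randn(bs, 5, d_model)
grid_mask_enc = torch.zeros(bs, 1, 1, 49, dtype=torch.bool)
pseudo_region_mask_enc = torch.zeros(bs, 1, 1, 5, dtype=torch.bool)

output = torch.cat([grid_enc_output, pseudo_region_enc_output], dim=1)
mask = torch.cat([grid_mask_enc, pseudo_region_mask_enc], dim=-1)
print(output.shape, mask.shape)  # (2, 54, 512) (2, 1, 1, 54)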

Decoding the combined features
Next, seq (i.e. the caption), output, and mask are fed into the decoder, entering the TransformerDecoderLayer.forward() method.
input has shape (bs, seq_len), encoder_output has shape (bs, max_detections+num_clusters, d_model), and mask_encoder has shape (bs, 1, 1, max_detections+num_clusters).

mask_queries is the non-pad matrix of the input seq: positions that are not pad are True, and pad positions are False.
mask_self_attention is an upper-triangular (seq_len, seq_len) matrix: everything from diagonal=1 upward is 1, and the remaining elements are 0. Two unsqueeze(0) calls follow:
(seq_len, seq_len) -> (1, 1, seq_len, seq_len)
mask_self_attention is then added to a (bs, 1, 1, seq_len) tensor made of rows like [0,0,...,1,1,...,1], where a value of 1 marks padding:
(1, 1, seq_len, seq_len) + (bs, 1, 1, seq_len) = (bs, 1, seq_len, seq_len)
So the new mask_self_attention is a 4-D tensor with values 0, 1, and 2: 0 marks a token that can be seen or has already been predicted, 1 marks the causal mask, and 2 marks masked padding. Applying .gt(0) turns it into a boolean mask: True where the value is strictly greater than 0, False otherwise.
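A toy illustration of the 0/1/2 values and the final boolean mask (padding_idx assumed to be 1, as noted above):

import torch

padding_idx = 1
inp = torch.tensor([[5, 7, padding_idx, padding_idx]])         # last two tokens are <pad>
m = torch.triu(torch.ones(4, 4, dtype=torch.uint8), diagonal=1)
m = m.unsqueeze(0).unsqueeze(0)                                # (1, 1, 4, 4)
m = m + (inp == padding_idx).unsqueeze(1).unsqueeze(1).byte()  # add padding flags
print(m)         # entries are 0 (visible), 1 (future or pad), 2 (future and pad)
print(m.gt(0))   # True wherever attention must be blocked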
Then comes the check if self._is_stateful, which is only True during beam search(?). When TransformerDecoderLayer is defined, two variables are registered; they become part of the model's state, kept in memory, but do not participate in gradient updates. The purpose is to avoid recomputing these two values every time, reducing computation:
self.register_state('running_mask_self_attention', torch.zeros((1, 1, 0)).byte())
self.register_state('running_seq', torch.zeros((1,)).long())
Here, if self._is_stateful=True, running_mask_self_attention is fetched from memory and concatenated with the current mask_self_attention along the last dimension (dim=-1), producing the new mask_self_attention.
Let seq be a tensor running from 1 to seq_len (torch.arange(1, seq_len + 1)), expanded bs times to shape (bs, seq_len). Wherever mask_queries is 0, the value at the same position in seq is also set to 0.
Next, the input goes through a word_embedding layer, projecting it to d_model dimensions; likewise, seq goes through a position_embedding layer, also projected to d_model, and the two are added:
input and seq are both (bs, seq_len) -> (bs, seq_len, d_model)
out = word_emb(input) + pos_emb(seq)
def forward(self, input, encoder_output, mask_encoder):
    # input (bs, seq_len)
    b_s, seq_len = input.shape[:2]
    # (b_s, seq_len, 1)
    mask_queries = (input != self.padding_idx).unsqueeze(-1).float()
    mask_self_attention = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8, device=input.device), diagonal=1)
    # (1, 1, seq_len, seq_len)
    mask_self_attention = mask_self_attention.unsqueeze(0).unsqueeze(0)
    mask_self_attention = mask_self_attention + (input == self.padding_idx).unsqueeze(1).unsqueeze(1).byte()
    # (b_s, 1, seq_len, seq_len)
    mask_self_attention = mask_self_attention.gt(0)
    if self._is_stateful:
        self.running_mask_self_attention = torch.cat([self.running_mask_self_attention.type_as(mask_self_attention), mask_self_attention], -1)
        mask_self_attention = self.running_mask_self_attention
    # (b_s, seq_len)
    seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device)
    seq = seq.masked_fill(mask_queries.squeeze(-1) == 0, 0)
    if self._is_stateful:
        self.running_seq.add_(1)
        seq = self.running_seq
    # embedding layers
    out = self.word_emb(input) + self.pos_emb(seq)
    for i, l in enumerate(self.layers):
        out = l(out, encoder_output, mask_queries, mask_self_attention, mask_encoder)
    # (bs, seq_len, d_model) @ (d_model, vocab_size) = (bs, seq_len, vocab_size)
    out = self.fc(out)
    return F.log_softmax(out, dim=-1)
Then out, encoder_output, mask_queries, etc. are passed through the decoder layers, 3 in total, each with identical input and output shapes; this enters DecoderLayer.forward().

Again, this is the Transformer decoder implementation, close to the original paper. One notable detail is that the can_be_stateful argument differs between self_att and enc_att:
self_att = MultiHeadAttention(..., can_be_stateful=True, ...)
enc_att = MultiHeadAttention(..., can_be_stateful=False, ...)
The former is True and the latter False; this flag controls whether the keys/values are cached.
def forward(self, input, enc_output, mask_pad, mask_self_att, mask_enc_att):
    # MHA + AddNorm
    self_att = self.self_att(input, input, input, mask_self_att)
    self_att = self.lnorm1(input + self.dropout1(self_att))
    # (bs, seq_len, d_model) * (bs, seq_len, 1) = (bs, seq_len, d_model)
    self_att = self_att * mask_pad
    # MHA + AddNorm: image
    enc_att = self.enc_att(self_att, enc_output, enc_output, mask_enc_att)
    enc_att = self.lnorm2(self_att + self.dropout2(enc_att))
    enc_att = enc_att * mask_pad
    ff = self.pwff(enc_att)
    ff = ff * mask_pad
    return ff
Feeding input, mask_self_att, etc. into the self_att layer enters the MultiHeadAttention.forward() method:
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
    if self.can_be_stateful and self._is_stateful:
        self.running_keys = torch.cat([self.running_keys, keys], 1)
        keys = self.running_keys
        self.running_values = torch.cat([self.running_values, values], 1)
        values = self.running_values
    if self.identity_map_reordering:
        q_norm = self.layer_norm(queries)
        k_norm = self.layer_norm(keys)
        v_norm = self.layer_norm(values)
        out = self.attention(q_norm, k_norm, v_norm, attention_mask, attention_weights)
        out = queries + self.dropout(torch.relu(out))
    else:
        out = self.attention(queries, keys, values, attention_mask, attention_weights)
        out = self.dropout(out)
        out = self.layer_norm(queries + out)
    return out
When MultiHeadAttention() is defined, it checks:
- if can_be_stateful=True, running_keys and running_values are registered as state, each with shape (0, d_model)
self.can_be_stateful = can_be_stateful
if self.can_be_stateful:
    self.register_state('running_keys', torch.zeros((0, d_model)))
    self.register_state('running_values', torch.zeros((0, d_model)))
If self.identity_map_reordering=True, layer normalization is applied before the attention rather than after it; this is the distinction between post-LN and pre-LN.
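Schematically, the two orderings in the code above differ as follows (x stands for queries; a comment-only sketch):

# post-LN (identity_map_reordering=False):
#     out = LayerNorm(x + Dropout(Attention(x, k, v)))
# pre-LN  (identity_map_reordering=True):
#     out = x + Dropout(ReLU(Attention(LN(x), LN(k), LN(v))))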
Finally the decoder's output is returned; in train_xe() it is compared with captions_gt to compute the loss, which is backpropagated to update the parameters.

SCST training stage
The SCST training stage uses the policy-gradient method from self-critical reinforcement learning.
The dataloader is the previously defined dict_dataloader_train. Iterating over it yields iteration, detections, caps_gt and captions:
- detections: Tensor, (bs, max_detections, visual_dim)
- caps_gt: list of length bs, containing batch_size lists, each holding the 5 caption strings (caption: str) describing one image
- captions: Tensor, (bs, seq_len), holding the token indices of each caption
def train_scst(model, dataloader, optim, cider, text_field, scheduler_rl, e):
    # Training with self-critical
    tokenizer_pool = multiprocessing.Pool()
    running_reward = .0
    running_reward_baseline = .0
    model.train()
    scheduler_rl.step()
    print('lr = ', optim.state_dict()['param_groups'][0]['lr'])
    running_loss = .0
    seq_len = 20
    beam_size = 5
    # kwargs = {
    #     'text_flag': args.text2text
    # }
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader)) as pbar:
        for it, (detections, caps_gt, captions) in enumerate(dataloader):
            detections = detections.to(device)
            # text = captions.to(device)
            # kwargs['text'] = text
            outs, log_probs = model(mode='rl', images=detections, max_len=seq_len, eos_idx=text_field.vocab.stoi['<eos>'], beam_size=beam_size, out_size=beam_size)
            optim.zero_grad()
            # Rewards
            caps_gen = text_field.decode(outs.view(-1, seq_len))
            caps_gt = list(itertools.chain(*([c, ] * beam_size for c in caps_gt)))
            caps_gen, caps_gt = tokenizer_pool.map(evaluation.PTBTokenizer.tokenize, [caps_gen, caps_gt])
            reward = cider.compute_score(caps_gt, caps_gen)[1].astype(np.float32)
            reward = torch.from_numpy(reward).to(device).view(detections.shape[0], beam_size)
            reward_baseline = torch.mean(reward, -1, keepdim=True)
            loss = -torch.mean(log_probs, -1) * (reward - reward_baseline)
            loss = loss.mean()
            loss.backward()
            optim.step()
            running_loss += loss.item()
            running_reward += reward.mean().item()
            running_reward_baseline += reward_baseline.mean().item()
            pbar.set_postfix(loss=running_loss / (it + 1), reward=running_reward / (it + 1), reward_baseline=running_reward_baseline / (it + 1))
            pbar.update()
    loss = running_loss / len(dataloader)
    reward = running_reward / len(dataloader)
    reward_baseline = running_reward_baseline / len(dataloader)
    return loss, reward, reward_baseline
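A toy numeric check of the self-critical policy-gradient loss (shapes assumed from the code above: beam_size samples per image, with the reward baselined by the beam mean):

import torch

bs, beam_size, seq_len = 2, 5, 20
log_probs = -torch.rand(bs, beam_size, seq_len)   # per-token log-probs (<= 0)
reward = torch.rand(bs, beam_size)                # CIDEr of each sampled caption
reward_baseline = reward.mean(-1, keepdim=True)   # mean reward over the beam
loss = (-log_probs.mean(-1) * (reward - reward_baseline)).mean()
print(loss)  # captions beating the baseline get their log-prob pushed up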
mode, detections, max_len, beam_size, etc. are fed to model(), which enters Transformer.forward(); it sees rl mode, initializes BeamSearch, and enters the apply method:
elif mode == 'rl':
    bs = BeamSearch(self, max_len, eos_idx, beam_size)
    return bs.apply(images, out_size, return_probs)
What follows is the Beam Search algorithm: at each step the top-k sequences are kept, and the sentence assembled at the end is the final output caption.
model() returns outs and log_probs; the former holds the token indices. The corresponding sequences are generated and compared against caps_gt to compute reward and reward_baseline; the loss is then computed, backpropagated, and the parameters updated.
After each epoch, compute the validation loss and the metrics:
# Validation loss
val_loss = evaluate_loss(model, dataloader_val, loss, text_field, e)
writer.add_scalar('data/val_loss', val_loss, e)
# Validation scores
scores = evaluate_metrics(model, dict_dataloader_val, text_field, e)
val_cider = scores['CIDEr']
print("Validation scores", scores)
writer.add_scalar('data/val_cider', val_cider, e)
writer.add_scalar('data/val_bleu1', scores['BLEU'][0], e)
writer.add_scalar('data/val_bleu4', scores['BLEU'][3], e)
writer.add_scalar('data/val_meteor', scores['METEOR'], e)
writer.add_scalar('data/val_rouge', scores['ROUGE'], e)
And the test scores:
# Test scores
scores = evaluate_metrics(model, dict_dataloader_test, text_field, e)
test_cider = scores['CIDEr']
print("Test scores", scores)
writer.add_scalar('data/test_cider', test_cider, e)
writer.add_scalar('data/test_bleu1', scores['BLEU'][0], e)
writer.add_scalar('data/test_bleu4', scores['BLEU'][3], e)
writer.add_scalar('data/test_meteor', scores['METEOR'], e)
writer.add_scalar('data/test_rouge', scores['ROUGE'], e)
Prepare for the next epoch:
# Prepare for next epoch
best = False
if val_cider >= best_cider:
    best_cider = val_cider
    patience = 0
    best = True
else:
    patience += 1

best_test = False
if test_cider >= best_test_cider:
    best_test_cider = test_cider
    best_test = True

switch_to_rl = False
exit_train = False
Determine when the XE stage ends, when the SCST stage begins, and when training stops altogether:
if patience == 5:
    # the XE stage trains at least 15 epochs
    if e < args.xe_least:
        print('special treatment, e = {}'.format(e))
        use_rl = False
        switch_to_rl = False
        patience = 0
    elif not use_rl:
        use_rl = True
        switch_to_rl = True
        patience = 0
        optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
        scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
        for k in range(e - 1):
            scheduler_rl.step()
        print("Switching to RL")
    else:
        print('patience reached.')
        exit_train = True

if e == args.xe_most:
    # the XE stage runs no more than 20 epochs
    if not use_rl:
        use_rl = True
        switch_to_rl = True
        patience = 0
        optim_rl = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
        scheduler_rl = LambdaLR(optim_rl, lambda_lr_rl)
        for k in range(e - 1):
            scheduler_rl.step()
        print("Switching to RL")
When switching to SCST, restore the random seeds and model parameters from the best checkpoint:
if switch_to_rl and not best:
    data = torch.load(os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name))
    torch.set_rng_state(data['torch_rng_state'])
    torch.cuda.set_rng_state(data['cuda_rng_state'])
    np.random.set_state(data['numpy_rng_state'])
    random.setstate(data['random_rng_state'])
    model.load_state_dict(data['state_dict'])
    print('Resuming from epoch %d, validation loss %f, best_cider %f, and best test_cider %f' % (data['epoch'], data['val_loss'], data['best_cider'], data['best_test_cider']))
Save this epoch's state:
torch.save({
    'torch_rng_state': torch.get_rng_state(),
    'cuda_rng_state': torch.cuda.get_rng_state(),
    'numpy_rng_state': np.random.get_state(),
    'random_rng_state': random.getstate(),
    'epoch': e,
    'val_loss': val_loss,
    'val_cider': val_cider,
    'state_dict': model.state_dict(),
    'optimizer': optim.state_dict() if not use_rl else optim_rl.state_dict(),
    'scheduler': scheduler.state_dict() if not use_rl else scheduler_rl.state_dict(),
    'patience': patience,
    'best_cider': best_cider,
    'best_test_cider': best_test_cider,
    'use_rl': use_rl,
}, os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name))
Save the models: both the best validation model and the best test model are kept:
if best:
    copyfile(os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name), os.path.join(args.dir_to_save_model, '%s_best.pth' % args.exp_name))
if best_test:
    copyfile(os.path.join(args.dir_to_save_model, '%s_last.pth' % args.exp_name), os.path.join(args.dir_to_save_model, '%s_best_test.pth' % args.exp_name))
When training is over, exit:
if exit_train:
    writer.close()
    break