- 本质思想:利用Softmax加权通道维度信息,再与原特征图相乘得到结果特征图
- 代表性方法:SENet(注意模型SEAttention)
SEAttention(Squeeze-and-Excitation Attention),整体网络结构如下图所示:
- 参考论文:
- Squeeze:原始feature map的维度为HxWxC,Squeeze做的事情是把HxWxC压缩为1x1xC,相当于把HxW压缩成一维,实际中一般是用global average pooling实现的。HxW压缩成一维后,相当于这一维参数获得了之前HxW全局的视野,感受区域更广
- Excitation:得到Squeeze的1x1xC的表示后,加入一个全连接层(Fully Connected),对每个通道的重要性进行预测,得到不同channel的重要性大小后再激励到之前的feature map的对应channel上,再进行后续操作,具体实现如下代码所示
import numpy as np
import torch
from torch import nn
from torch.nn import init
# 此代码放于models/
class SEAttention(nn.Module):
    """Squeeze-and-Excitation channel attention.

    The input feature map is globally average-pooled to one value per channel
    (squeeze), passed through a two-layer bottleneck MLP that ends in a sigmoid
    (excitation), and the resulting per-channel gate rescales the input.

    NOTE(review): ``super().__init__()``, the ReLU between the two linear
    layers, the final Sigmoid and the closing parenthesis of ``nn.Sequential``
    were missing from the excerpt; they are restored following the SE design
    described in the surrounding text -- confirm against the reference code.

    Args:
        channel: number of channels of the input feature map.
        reduction: bottleneck ratio of the excitation MLP.
    """

    def __init__(self, channel=512, reduction=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid(),
        )

    def init_weights(self):
        """Re-initialise Conv2d / BatchNorm2d / Linear sub-modules in place."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        """Return ``x`` rescaled channel-wise; ``x`` is unpacked as (b, c, h, w)."""
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)       # squeeze: (b, c)
        y = self.fc(y).view(b, c, 1, 1)       # excitation gate: (b, c, 1, 1)
        return x * y.expand_as(x)
# YOLOv5 by YOLOAir, GPL-3.0 license
# Parameters
nc: 80 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOv5 v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[-1, 1, SEAttention, [1024]],
[[17, 20, 24], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
elif m in [SEAttention]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
- 空间注意力模型适合放于head中,最好是在预测层前
- 在YOLOv5中加入SEAttention,有助于平稳训练过程,可避免出现验证集的损失先下降再上升的情形
- 本质思想:关注于特征图空间维度(宽高)信息,局部空间注意力和全局空间注意力
- 代表性方法:S2Attention
- 首先对输入特征图进行一个全连接,其实也就是1×1卷积,只不过这里将维度变为了原来的三倍,然后经过一个 GELU 激活函数
- 然后特征图被均分为 三等份,分别用于后续三个 Spatial-shift 分支的输入;第一个分支Spatial-shift 操作,即右-左-下-上移动,第二个分支进行与第一个分支反对称的 Spatial-shift 操作,即下-上-右-左移动,第三个分支保持不变
- 最后将三个分支的结果通过 Split Attention 结合起来。这样不同位置的信息就被加到同一个通道上对齐了,再经过一个 MLP 进行不同位置的信息整合,然后经过 LN(层归一化,Layer Normalization)
- 参考论文:
import numpy as np
import torch
from torch import nn
from torch.nn import init
# 此代码放于models/
def spatial_shift1(x):
    """Shift the four channel quarters of ``x`` by one step, in place.

    ``x`` is unpacked as (b, w, h, c). Quarter 1 is shifted by +1 along dim 1,
    quarter 2 by -1 along dim 1, quarter 3 by +1 along dim 2 and quarter 4 by
    -1 along dim 2. The border row/column that has no source keeps its value.

    The source and destination slices of each assignment overlap in memory,
    so every source slice is cloned first; copying between overlapping views
    of one tensor has no guaranteed result.

    Returns:
        The same tensor object ``x`` (modified in place).
    """
    b, w, h, c = x.size()
    x[:, 1:, :, :c // 4] = x[:, :w - 1, :, :c // 4].clone()
    x[:, :w - 1, :, c // 4:c // 2] = x[:, 1:, :, c // 4:c // 2].clone()
    x[:, :, 1:, c // 2:c * 3 // 4] = x[:, :, :h - 1, c // 2:c * 3 // 4].clone()
    x[:, :, :h - 1, 3 * c // 4:] = x[:, :, 1:, 3 * c // 4:].clone()
    return x
def spatial_shift2(x):
    """Shift the four channel quarters of ``x`` by one step, in place.

    ``x`` is unpacked as (b, w, h, c). Quarter 1 is shifted by +1 along dim 2,
    quarter 2 by -1 along dim 2, quarter 3 by +1 along dim 1 and quarter 4 by
    -1 along dim 1. The border row/column that has no source keeps its value.

    The source and destination slices of each assignment overlap in memory,
    so every source slice is cloned first; copying between overlapping views
    of one tensor has no guaranteed result.

    Returns:
        The same tensor object ``x`` (modified in place).
    """
    b, w, h, c = x.size()
    x[:, :, 1:, :c // 4] = x[:, :, :h - 1, :c // 4].clone()
    x[:, :, :h - 1, c // 4:c // 2] = x[:, :, 1:, c // 4:c // 2].clone()
    x[:, 1:, :, c // 2:c * 3 // 4] = x[:, :w - 1, :, c // 2:c * 3 // 4].clone()
    x[:, :w - 1, :, 3 * c // 4:] = x[:, 1:, :, 3 * c // 4:].clone()
    return x
class SplitAttention(nn.Module):
    """Fuse ``k`` feature branches with per-channel softmax weights.

    NOTE(review): the body of ``__init__`` was missing from the excerpt. The
    attributes below are the ones ``forward`` reads (``mlp1``, ``gelu``,
    ``mlp2``, ``k``, ``softmax``); their sizes follow the shape comments in
    ``forward`` (c -> c -> k*c, softmax over the branch axis). The bias-free
    linear layers are an assumption -- confirm against the reference code.

    Args:
        channel: channel count ``c`` of every branch.
        k: number of branches stacked along dim 1 of the input.
    """

    def __init__(self, channel=512, k=3):
        super().__init__()
        self.channel = channel
        self.k = k
        self.mlp1 = nn.Linear(channel, channel, bias=False)
        self.gelu = nn.GELU()
        self.mlp2 = nn.Linear(channel, channel * k, bias=False)
        self.softmax = nn.Softmax(1)  # normalise across the k branches

    def forward(self, x_all):
        """Combine branches; ``x_all`` is (b, k, h, w, c), result is (b, h, w, c)."""
        b, k, h, w, c = x_all.shape
        x_all = x_all.reshape(b, k, -1, c)              # bs,k,n,c
        a = torch.sum(torch.sum(x_all, 1), 1)           # bs,c
        hat_a = self.mlp2(self.gelu(self.mlp1(a)))      # bs,kc
        hat_a = hat_a.reshape(b, self.k, c)             # bs,k,c
        bar_a = self.softmax(hat_a)                     # bs,k,c
        attention = bar_a.unsqueeze(-2)                 # bs,k,1,c
        out = attention * x_all                         # bs,k,n,c
        out = torch.sum(out, 1).reshape(b, h, w, c)
        return out
class S2Attention(nn.Module):
    """Spatial-shift attention: expand channels x3, shift two thirds, fuse.

    Changes relative to the excerpt:
      * ``super().__init__()`` is called before sub-modules are registered.
      * ``SplitAttention`` is built with ``channels`` instead of its default
        of 512, so the block works for any channel width.
      * the result is permuted back to channels-first, matching the input
        layout, so following convolutional layers receive (b, c, w, h).
        NOTE(review): the final permute was absent from the excerpt; it is
        assumed to have been lost in extraction -- confirm.

    Args:
        channels: channel count of the input feature map.
        out_channel: accepted for interface compatibility; not used.
    """

    def __init__(self, channels=512, out_channel=1024):
        super().__init__()
        self.mlp1 = nn.Linear(channels, channels * 3)
        self.mlp2 = nn.Linear(channels, channels)
        self.split_attention = SplitAttention(channels)

    def forward(self, x):
        """Apply attention to ``x`` (unpacked as b, c, w, h); same shape out."""
        b, c, w, h = x.size()
        x = x.permute(0, 2, 3, 1)                 # channels last for nn.Linear
        x = self.mlp1(x)                          # b, w, h, 3c
        x1 = spatial_shift1(x[:, :, :, :c])       # shifts operate in place on views
        x2 = spatial_shift2(x[:, :, :, c:c * 2])
        x3 = x[:, :, :, c * 2:]                   # third branch is left unshifted
        x_all = torch.stack([x1, x2, x3], 1)
        a = self.split_attention(x_all)
        x = self.mlp2(a)
        x = x.permute(0, 3, 1, 2)                 # back to channels first
        return x
# parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
# anchors
#- [5,6, 7,9, 12,10] # P2/4
# - [10,13, 16,30, 33,23] # P3/8
# - [30,61, 62,45, 59,119] # P4/16
# - [116,90, 156,198, 373,326] # P5/32
- [11,10, 17,16, 25,24] # P3/8
- [38,37, 46,72, 88,67] # P4/16
- [75,129, 180,145, 283,348] # P5/32
# YOLOv5 backbone
# [from, number, module, args]
[ [-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOv5 v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]],
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 20 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 23 (P4/16-medium) [256, 256, 1, False]
# [-1, 3, CBAM, [512]], #26
[-1, 1, Conv, [512, 3, 2]], # 24 [256, 256, 3, 2]
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 26 (P5/32-large) [512, 512, 1, False]
[-1, 1, S2Attention, [1024]],
[[17, 20, 24], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 参数量:YOLOv5s_S2Attention summary: 278 layers, 9126920 parameters, 9126920 gradients, 17.6 GFLOPs
- 这个配置空间注意力放于输出预测层之后,空间注意力放于输出预测层之前效果会更好,推荐在实际应用中空间注意力放于三层输出预测层之前
elif m in [S2Attention]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
- 本质思想:串联或并联结合通道注意力和空间注意力
- 代表性方法:CBAM、BAM、SKAttention
通道注意力+空间注意力模块CBAM: Convolutional Block Attention Module,整体结构如下图所示:
- 通道注意力模块(CAM):通道维度不变,压缩空间维度,该模块加权输入特征图的维度信息
- 空间注意力模块(SAM):空间维度不变,压缩通道维度,该模块关注于目标的位置信息
- 参考论文:
import torch
import torch.nn as nn
import torch.nn.functional as F
# 此代码放于models/
class ChannelAttentionModule(nn.Module):
    """Channel attention gate built from average- and max-pooled descriptors.

    Both pooled descriptors go through the same bottleneck MLP; their sum is
    squashed by a sigmoid. The excerpt left ``nn.Sequential(`` unclosed; the
    closing parenthesis is restored here.

    Args:
        c1: number of input channels.
        reduction: bottleneck ratio of the shared MLP.
    """

    def __init__(self, c1, reduction=16):
        super(ChannelAttentionModule, self).__init__()
        mid_channel = c1 // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.shared_MLP = nn.Sequential(
            nn.Linear(in_features=c1, out_features=mid_channel),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Linear(in_features=mid_channel, out_features=c1),
        )
        self.act = nn.Sigmoid()
        # self.act=nn.SiLU()

    def forward(self, x):
        """Return the gate of shape (b, c1, 1, 1) for a 4-D input ``x``."""
        avgout = self.shared_MLP(self.avg_pool(x).view(x.size(0), -1)).unsqueeze(2).unsqueeze(3)
        maxout = self.shared_MLP(self.max_pool(x).view(x.size(0), -1)).unsqueeze(2).unsqueeze(3)
        return self.act(avgout + maxout)
class SpatialAttentionModule(nn.Module):
    """Spatial attention gate computed from channel-wise mean and max maps.

    The excerpt's concatenation line had lost its ``torch.cat(`` call; it is
    restored so the 2-channel map matches ``conv2d``'s ``in_channels=2``.
    """

    def __init__(self):
        super(SpatialAttentionModule, self).__init__()
        self.conv2d = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=7, stride=1, padding=3)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Return a single-channel gate with the same spatial size as ``x``."""
        avgout = torch.mean(x, dim=1, keepdim=True)
        maxout, _ = torch.max(x, dim=1, keepdim=True)
        out = torch.cat([avgout, maxout], dim=1)
        out = self.act(self.conv2d(out))
        return out
class CBAM(nn.Module):
    """Apply channel attention, then spatial attention, to a feature map.

    Args:
        c1: number of input channels (forwarded to the channel attention).
        c2: accepted for interface compatibility; not used, and the output
            has the same shape as the input.
    """

    def __init__(self, c1, c2):
        super(CBAM, self).__init__()
        self.channel_attention = ChannelAttentionModule(c1)
        self.spatial_attention = SpatialAttentionModule()

    def forward(self, x):
        """Return ``x`` reweighted by the channel gate and then the spatial gate."""
        out = self.channel_attention(x) * x
        out = self.spatial_attention(out) * out
        return out
# YOLOv5 by YOLOAir, GPL-3.0 license
# Parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
# anchors
#- [5,6, 7,9, 12,10] # P2/4
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 backbone
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [c=3,64*0.5=32,3]
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, CBAM, [1024]], #9
[-1, 1, SPPF, [1024,5]], #10
# YOLOv5 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 14
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 18 (P3/8-small)
[-1, 1, CBAM, [256]], #19
[-1, 1, Conv, [256, 3, 2]],
[[-1, 15], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 22 (P4/16-medium) [256, 256, 1, False]
[-1, 1, CBAM, [512]],
[-1, 1, Conv, [512, 3, 2]], #[256, 256, 3, 2]
[[-1, 11], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 25 (P5/32-large) [512, 512, 1, False]
[-1, 1, CBAM, [1024]],
[[19, 23, 27], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 在配置文件中,Backbone和Head中均包含CBAM模块,在训练时加入CBAM训练更稳定,因此一般推荐在Backbone中SPPF前面添加以及在输出预测层前添加
elif m in [CBAM]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
选择性卷积核网络SKNet(Selective Kernel Networks)SKAttention,整体结构如下图所示:
- 在传统的CNN中每一个卷积层都是用相同大小的卷积核,限制了模型的表达能力,像Inception这种宽度模型结构也验证了使用多个不同的卷积核进行学习确实可以提升模型的表达能力
- SKAttention借鉴了SENet的思想,通过动态计算每个卷积核得到通道的权重,动态的将各个卷积核的结果进行融合
- 模型分为三个部分:Split、Fuse和Select
- Split就是一个multi-branch的操作,用不同的卷积核进行卷积得到不同的特征;Fuse部分就是用SENet的结构获取通道注意力的矩阵(N个卷积核就可以得到N个注意力矩阵,这步操作对所有的特征参数共享),这样就可以得到不同kernel经过SENet之后的特征;Select操作就是将得到的这几个特征进行相加
- 参考论文:
import numpy as np
import torch
from torch import nn
from torch.nn import init
from collections import OrderedDict
# 此代码放于models/
class SKAttention(nn.Module):
    """Selective-Kernel attention: fuse parallel convs with different kernels.

    NOTE(review): the excerpt had lost ``super().__init__()``, the module
    lists, the list initialisations, the softmax and the final fusion. They
    are restored here following the split / fuse / select steps named in the
    in-line comments and the attributes ``forward`` reads (``convs``, ``fc``,
    ``fcs``). The BatchNorm + ReLU after each conv and ``d = max(L, channel
    // reduction)`` are assumptions -- confirm against the reference code.

    Args:
        channel: channel count of the input (and output) feature map.
        out_channel: accepted for interface compatibility; not used.
        kernels: kernel size of each parallel branch (never mutated here).
        reduction: reduction ratio for the hidden descriptor.
        group: ``groups`` argument of every branch convolution.
        L: lower bound for the hidden descriptor size.
    """

    def __init__(self, channel=512, out_channel=1024, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32):
        super().__init__()
        self.d = max(L, channel // reduction)
        self.convs = nn.ModuleList()
        for k in kernels:
            self.convs.append(
                nn.Sequential(OrderedDict([
                    ('conv', nn.Conv2d(channel, channel, kernel_size=k, padding=k // 2, groups=group)),
                    ('bn', nn.BatchNorm2d(channel)),
                    ('relu', nn.ReLU()),
                ]))
            )
        self.fc = nn.Linear(channel, self.d)
        self.fcs = nn.ModuleList([nn.Linear(self.d, channel) for _ in kernels])
        self.softmax = nn.Softmax(dim=0)  # normalise across the branch axis

    def forward(self, x):
        """Return the attention-weighted sum of all branches; same shape as ``x``."""
        bs, c, _, _ = x.size()
        ### split
        conv_outs = [conv(x) for conv in self.convs]
        feats = torch.stack(conv_outs, 0)  # k,bs,channel,h,w
        ### fuse
        U = sum(conv_outs)  # bs,c,h,w
        ### reduction channel
        S = U.mean(-1).mean(-1)  # bs,c
        Z = self.fc(S)  # bs,d
        ### calculate attention weight
        weights = [fc(Z).view(bs, c, 1, 1) for fc in self.fcs]  # each bs,channel,1,1
        attention_weights = torch.stack(weights, 0)  # k,bs,channel,1,1
        attention_weights = self.softmax(attention_weights)  # k,bs,channel,1,1
        ### select
        V = (attention_weights * feats).sum(0)
        return V
# parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
# anchors
#- [5,6, 7,9, 12,10] # P2/4
# - [10,13, 16,30, 33,23] # P3/8
# - [30,61, 62,45, 59,119] # P4/16
# - [116,90, 156,198, 373,326] # P5/32
- [11,10, 17,16, 25,24] # P3/8
- [38,37, 46,72, 88,67] # P4/16
- [75,129, 180,145, 283,348] # P5/32
# YOLOv5 backbone
# [from, number, module, args]
[ [-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOv5 v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]],
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 20 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 23 (P4/16-medium) [256, 256, 1, False]
# [-1, 3, CBAM, [512]], #26
[-1, 1, Conv, [512, 3, 2]], # 24 [256, 256, 3, 2]
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 26 (P5/32-large) [512, 512, 1, False]
[-1, 1, SKAttention, [1024]],
[[17, 20, 24], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 参数量:YOLOv5s_SKAttention summary: 295 layers, 29137960 parameters, 29137960 gradients, 33.7 GFLOPs
- SKAttention层整体计算量多,建议仅在最后一层输出预测层前添加
- 缺点:训练稳定性差,最终精度不如其他注意力模型
elif m in [SKAttention]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
本文选取全局注意力机制GAM(Global Attention Mechanism),整体结构如下图所示:
- 通道注意力子模块:通道注意子模块使用三维排列来在第三个维度(即通道维度)上保留信息,然后用一个两层的MLP(多层感知器)放大跨维通道-空间依赖性(MLP是一种编码-解码器结构,与BAM相同,其压缩比为r),通道注意子模块如下图所示
- 空间注意力子模块:在空间注意力子模块中,为了关注空间信息,使用两个卷积层进行空间信息融合,还从通道注意力子模块中使用了与BAM相同的缩减比r;与此同时由于最大池化操作减少了信息的使用,产生了消极的影响,这里删除了池化操作以进一步保留特性映射,因此空间注意力模块有时会显著增加参数的数量;为了防止参数显著增加,在ResNet50中采用带Channel Shuffle的Group卷积,无Group卷积的空间注意力子模块如下图所示
- 参考论文:
import numpy as np
import torch
from torch import nn
from torch.nn import init
# 此代码放于models/
class GAM_Attention(nn.Module):
    """Global attention: a channel MLP gate followed by a conv spatial gate.

    NOTE(review): both ``nn.Sequential`` definitions were truncated in the
    excerpt (unclosed parentheses, cut-off ``nn.Conv2d`` arguments). The
    non-grouped convolutions are completed with the same ``kernel_size=7,
    padding=3`` as the grouped ones; the ReLU layers and the trailing
    ``BatchNorm2d(c2)`` are assumptions -- confirm against the reference code.

    ``forward`` calls a module-level ``channel_shuffle(x, groups)`` helper
    that is not part of this class and must be available in the same module.

    Args:
        c1: input channels.
        c2: output channels of the spatial branch; the element-wise product
            in ``forward`` needs it to be broadcast-compatible with ``c1``.
        group: use grouped 7x7 convolutions (``groups=rate``) when true.
        rate: reduction ratio shared by both branches.
    """

    def __init__(self, c1, c2, group=True, rate=4):
        super(GAM_Attention, self).__init__()
        self.channel_attention = nn.Sequential(
            nn.Linear(c1, int(c1 / rate)),
            nn.ReLU(inplace=True),
            nn.Linear(int(c1 / rate), c1),
        )
        self.spatial_attention = nn.Sequential(
            nn.Conv2d(c1, c1 // rate, kernel_size=7, padding=3, groups=rate) if group
            else nn.Conv2d(c1, int(c1 / rate), kernel_size=7, padding=3),
            nn.BatchNorm2d(int(c1 / rate)),
            nn.ReLU(inplace=True),
            nn.Conv2d(c1 // rate, c2, kernel_size=7, padding=3, groups=rate) if group
            else nn.Conv2d(int(c1 / rate), c2, kernel_size=7, padding=3),
            nn.BatchNorm2d(c2),
        )

    def forward(self, x):
        """Return ``x`` scaled by the channel gate and then the spatial gate."""
        b, c, h, w = x.shape
        # reshape instead of view: the permuted tensor is not contiguous
        x_permute = x.permute(0, 2, 3, 1).reshape(b, -1, c)
        x_att_permute = self.channel_attention(x_permute).view(b, h, w, c)
        x_channel_att = x_att_permute.permute(0, 3, 1, 2)
        # x_channel_att=channel_shuffle(x_channel_att,4) #last shuffle
        x = x * x_channel_att
        x_spatial_att = self.spatial_attention(x).sigmoid()
        x_spatial_att = channel_shuffle(x_spatial_att, 4)  # last shuffle
        out = x * x_spatial_att
        # out=channel_shuffle(out,4) #last shuffle
        return out
# parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
# anchors
#- [5,6, 7,9, 12,10] # P2/4
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 backbone
# [from, number, module, args] # [c=channels,module,kernlsize,strides]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [c=3,64*0.5=32,3]
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, GAM_Attention, [512,512]], #9
[-1, 1, SPPF, [1024,5]], #10
# YOLOv5 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 14
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 18 (P3/8-small)
[-1, 1, GAM_Attention, [128,128]], #19
[-1, 1, Conv, [256, 3, 2]],
[[-1, 15], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 22 (P4/16-medium) [256, 256, 1, False]
[-1, 1, GAM_Attention, [256,256]],
[-1, 1, Conv, [512, 3, 2]], #[256, 256, 3, 2]
[[-1, 11], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 25 (P5/32-large) [512, 512, 1, False]
[-1, 1, GAM_Attention, [512,512]], #
[[19, 23, 27], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 参数量:YOLOv5s_gam summary: 314 layers, 11069767 parameters, 11069767 gradients, 21.6 GFLOPs
- 在Backbone中倒数第二层和Head中输出预测层前添加GAM_Attention
- 建议:GAM_Attention计算量适中,可多放于Backbone和Head三层预测输出层前
elif m in [GAM_Attention]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
- 整体结构与SKAttention类似,多了分组卷积和组归一化(Group Normalization)
- 参考论文:
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
# 此代码放于models/
class ShuffleAttention(nn.Module):
    """Grouped channel + spatial attention followed by a channel shuffle.

    NOTE(review): the excerpt had fused two assignments on one line and lost
    ``super().__init__()``, ``self.G``, ``self.sigmoid``, the channel gating
    line, the GroupNorm call and the concatenation. They are restored from the
    attributes ``forward`` reads and its shape comments -- confirm against the
    reference code. ``channel_shuffle`` is declared a static method because it
    takes no ``self`` but is called as ``self.channel_shuffle(out, 2)``.

    Args:
        channel: input channels; must be divisible by ``2 * G``.
        reduction: accepted for interface compatibility; not used.
        G: number of groups the channels are split into.
    """

    def __init__(self, channel=512, reduction=16, G=8):
        super().__init__()
        self.G = G
        self.channel = channel
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.gn = nn.GroupNorm(channel // (2 * G), channel // (2 * G))
        self.cweight = Parameter(torch.zeros(1, channel // (2 * G), 1, 1))
        self.cbias = Parameter(torch.ones(1, channel // (2 * G), 1, 1))
        self.sweight = Parameter(torch.zeros(1, channel // (2 * G), 1, 1))
        self.sbias = Parameter(torch.ones(1, channel // (2 * G), 1, 1))
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise Conv2d / BatchNorm2d / Linear sub-modules in place."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    @staticmethod
    def channel_shuffle(x, groups):
        """Interleave the channels of ``x`` (b, c, h, w) across ``groups``."""
        b, c, h, w = x.shape
        x = x.reshape(b, groups, -1, h, w)
        x = x.permute(0, 2, 1, 3, 4)
        # flatten
        x = x.reshape(b, -1, h, w)
        return x

    def forward(self, x):
        """Return the attended feature map; same shape as ``x``."""
        b, c, h, w = x.size()
        # group into subfeatures
        x = x.reshape(b * self.G, -1, h, w)  # bs*G,c//G,h,w
        # channel split
        x_0, x_1 = x.chunk(2, dim=1)  # bs*G,c//(2*G),h,w
        # channel attention
        x_channel = self.avg_pool(x_0)  # bs*G,c//(2*G),1,1
        x_channel = self.cweight * x_channel + self.cbias  # bs*G,c//(2*G),1,1
        x_channel = x_0 * self.sigmoid(x_channel)
        # spatial attention
        x_spatial = self.gn(x_1)  # bs*G,c//(2*G),h,w
        x_spatial = self.sweight * x_spatial + self.sbias  # bs*G,c//(2*G),h,w
        x_spatial = x_1 * self.sigmoid(x_spatial)  # bs*G,c//(2*G),h,w
        # concatenate along channel axis
        out = torch.cat([x_channel, x_spatial], dim=1)  # bs*G,c//G,h,w
        out = out.contiguous().view(b, -1, h, w)
        # channel shuffle
        out = self.channel_shuffle(out, 2)
        return out
# YOLOv5 by Ultralytics, GPL-3.0 license
# Parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOAir v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[-1, 1, ShuffleAttention, [1024]], # 修改
[[17, 20, 24], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
elif m is ShuffleAttention:
c1, c2 = ch[f], args[0]
if c2 != no:
c2 = make_divisible(c2 * gw, 8)
非深度网络Non-Deep Network(ParNet, Parallel Network),整体结构如下图所示:
- 12层的深度网络在ImageNet上达到了80.7%的top-1精度(Non-Deep Network很符合)
- ParNet由处理不同分辨率特征的并行子结构组成,这些并行的子结构称为流,来自不同流的特征在网络的后期阶段进行融合,这些融合的特征用于网络后面输出层
- 图(a)是ParNet网络结构,图(b)是ParNet Block,也称作为 RepVGG-SSE Block
- ParNet Block在训练阶段由三个并行分支组成:1×1 卷积、3×3 卷积和SSE(Skip-Squeeze-and-Excitation);在推理阶段,1×1 卷积和3×3 卷积融合在一起
- 在ParNet Block中,使用SiLU替换ReLU激活函数,SSE中包含通道注意力SENet
- ParNet Network由Downsampling、RepVGG SSE block、Fusion 和 Avg Pool以及FC组成,其中Downsampling和Fusion如下图所示
- 参考论文:
import numpy as np
import torch
from torch import nn
from torch.nn import init
# 此代码放于models/
class ParNetAttention(nn.Module):
    """ParNet block: 1x1 conv + 3x3 conv + SSE gate, summed and passed to SiLU.

    NOTE(review): the excerpt collapsed the three branches into one unclosed
    ``nn.Sequential`` and lost ``super().__init__()`` and ``self.silu``. The
    branches are restored from the attributes ``forward`` reads (``sse``,
    ``conv1x1``, ``conv3x3``, ``silu``); the pooling/sigmoid in ``sse`` and
    the BatchNorm layers are assumptions -- confirm against the reference.

    ``forward`` multiplies ``sse(x)`` with ``x`` and adds all branches, so
    ``out_channel`` has to equal ``channel`` for the shapes to line up.

    Args:
        channel: input channels.
        out_channel: output channels of every branch.
    """

    def __init__(self, channel=512, out_channel=512):
        super().__init__()
        self.sse = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channel, out_channel, kernel_size=1),
            nn.Sigmoid(),
        )
        self.conv1x1 = nn.Sequential(
            nn.Conv2d(channel, out_channel, kernel_size=1),
            nn.BatchNorm2d(out_channel),
        )
        self.conv3x3 = nn.Sequential(
            nn.Conv2d(channel, out_channel, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channel),
        )
        self.silu = nn.SiLU()

    def forward(self, x):
        """Return SiLU(conv1x1(x) + conv3x3(x) + sse(x) * x)."""
        b, c, _, _ = x.size()
        x1 = self.conv1x1(x)
        x2 = self.conv3x3(x)
        x3 = self.sse(x) * x
        y = self.silu(x1 + x2 + x3)
        return y
# parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
# anchors
#- [5,6, 7,9, 12,10] # P2/4
# - [10,13, 16,30, 33,23] # P3/8
# - [30,61, 62,45, 59,119] # P4/16
# - [116,90, 156,198, 373,326] # P5/32
- [11,10, 17,16, 25,24] # P3/8
- [38,37, 46,72, 88,67] # P4/16
- [75,129, 180,145, 283,348] # P5/32
# YOLOv5 backbone
# [from, number, module, args]
[ [-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOv5 v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]],
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 20 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 23 (P4/16-medium) [256, 256, 1, False]
[-1, 1, Conv, [512, 3, 2]], #24 #[256, 256, 3, 2]
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 26 (P5/32-large) [512, 512, 1, False]
[-1, 1, ParNetAttention, [1024]],
[[17, 20, 24], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 此配置仅在Head最后一层输出预测层之后添加ParNetAttention
- 建议:在实际应用如果有轻量化需求,在Backbone和Head中可用ParNet取代C3模块
elif m in [ParNetAttention]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, *args[1:]]
ConvNeXt Block的整体结构图如下图所示:
- 在激活函数选择上,使用GELU取代ReLU
- 使用更少的激活函数,在最终的检测准确率提高了1%以上
- 使用更少的归一化层,将BN换成LN
- 单独的下采样层:在ResNet网络中stage2-stage4的下采样都是通过将主分支上3x3的卷积层步距设置成2,捷径分支上1x1的卷积层步距设置成2进行下采样的;ConvNeXt单独使用了一个下采样层,就是通过一个Layer Normalization加上一个卷积核大小为2步距为2的卷积层构成
- 参考论文:
# 此代码放于models/
class LayerNorm_s(nn.Module):
    """LayerNorm over the channel axis for channels-last or channels-first input.

    ``super().__init__()`` is called first; without it ``nn.Module`` cannot
    register the ``weight`` and ``bias`` parameters.

    Args:
        normalized_shape: number of channels to normalise over.
        eps: value added to the variance for numerical stability.
        data_format: ``"channels_last"`` (channels in the last dim) or
            ``"channels_first"`` (channels in dim 1).

    Raises:
        NotImplementedError: if ``data_format`` is neither of the two values.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        """Normalise ``x`` over its channel axis and apply the affine transform."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            # manual layer norm over dim 1 (mean and biased variance)
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x
class ConvNextBlock(nn.Module):
    """ConvNeXt residual block: depthwise 7x7 conv -> LN -> MLP (x4) -> scale.

    ``super().__init__()`` is called before sub-modules are registered, and
    the residual local is no longer named after the builtin ``input``.

    Args:
        dim: number of input and output channels.
        drop_path: stochastic-depth rate; ``nn.Identity`` is used when it is 0.
        layer_scale_init_value: initial value of the learnable per-channel
            scale ``gamma``; no scale is applied when it is not positive.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.norm = LayerNorm_s(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise conv expressed as Linear on channels-last
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        """Return ``x + drop_path(block(x))`` for an (N, C, H, W) input."""
        shortcut = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        x = shortcut + self.drop_path(x)
        return x
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    The excerpt left the docstring unterminated and the ``drop_path_f`` call
    cut off after its second argument; the module's own ``training`` flag is
    passed as the third argument so dropping only happens in training mode.

    Args:
        drop_prob: probability of dropping the path for each sample.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path_f`` to ``x`` with this module's rate and mode."""
        return drop_path_f(x, self.drop_prob, self.training)
def drop_path_f(x, drop_prob: float = 0., training: bool = False):
    """Randomly zero whole samples of ``x`` and rescale the survivors.

    Args:
        x: tensor whose first dimension indexes samples.
        drop_prob: probability of dropping each sample. ``None`` (the default
            of ``DropPath``) and ``0`` both mean "never drop".
        training: dropping is only applied when this is true.

    Returns:
        ``x`` itself when nothing is dropped; otherwise a new tensor in which
        each sample is either all zeros or scaled by ``1 / (1 - drop_prob)``.
        With ``drop_prob >= 1`` every sample is dropped and zeros are
        returned (dividing by a zero keep-probability would give NaN).
    """
    if not drop_prob or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0:
        return torch.zeros_like(x)
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize: 1 with probability keep_prob, else 0
    output = x.div(keep_prob) * random_tensor
    return output
class CNeB(nn.Module):
    """CSP-style block with three convolutions around ``n`` ConvNextBlocks.

    Similar in layout to the C3 module in YOLOv5: one branch runs
    ``cv1 -> n x ConvNextBlock``, the other runs ``cv2``; both are
    concatenated on the channel axis and merged by ``cv3``.

    NOTE(review): ``super().__init__()`` was missing and the ``forward``
    expression was truncated in the excerpt. It is completed as
    ``cat(m(cv1(x)), cv2(x))``, which matches the ``2 * c_`` input width of
    ``cv3`` -- confirm against the reference code.

    Args:
        c1: input channels.
        c2: output channels.
        n: number of stacked ``ConvNextBlock`` modules.
        shortcut, g: accepted for interface compatibility; not used.
        e: expansion ratio that sets the hidden width ``int(c2 * e)``.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)
        self.m = nn.Sequential(*(ConvNextBlock(c_) for _ in range(n)))

    def forward(self, x):
        """Return ``cv3`` applied to the concatenation of both branches."""
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
# YOLOAir by , GPL-3.0 license
# Parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
# YOLOv5 v6.0 head
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, CNeB, [512]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, CNeB, [256]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, CNeB, [512]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, CNeB, [1024]], # 23 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
- nc:检测的类别数
- depth_multiple:控制每层代码块的个数
- width_multiple:控制每层特征图的深度
- 建议:ConvNextBlock可用于取代C3(建议只改变Backbone中的C3),CNeB模块可用于调整模型输出预测层前特征分布
elif m is ConvNextBlock:
c1, c2 = ch[f], args[0]
if c2 != no:
c2 = make_divisible(c2 * gw, 8)
args = [c1, c2, *args[1:]]
if m is ConvNextBlock:
args.insert(2, n)
n = 1
# 此文件放于utils/metrics
def bbox_alpha_iou(box1, box2, x1y1x2y2=False, GIoU=False, DIoU=False, CIoU=False, alpha=2, eps=1e-9):
    """Return the alpha-IoU of ``box1`` to ``box2``.

    ``box1`` is indexed as 4 coordinates; ``box2`` is transposed first, so it
    is expected as n x 4. The plain IoU is raised to the power ``alpha``;
    the GIoU / DIoU / CIoU penalty terms are raised to the same power.

    Args:
        box1: first box(es), indexed ``box1[0..3]``.
        box2: second boxes, n x 4 before the internal transpose.
        x1y1x2y2: coordinates are corner format when true, else
            (center x, center y, width, height).
        GIoU, DIoU, CIoU: select the penalty term; DIoU is checked before
            CIoU, and GIoU only applies when neither of them is set.
        alpha: exponent applied to the IoU and penalty terms.
        eps: small constant guarding divisions and powers.

    Returns:
        Tensor with the selected alpha-IoU variant.
    """
    box2 = box2.T

    # Get the coordinates of bounding boxes
    if x1y1x2y2:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
    else:  # transform from xywh to xyxy
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

    # Intersection area
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union Area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps

    # change iou into pow(iou+eps)
    # iou = inter / union
    iou = torch.pow(inter / union + eps, alpha)
    # beta = 2 * alpha
    if GIoU or DIoU or CIoU:
        cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)  # convex (smallest enclosing box) width
        ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
        if CIoU or DIoU:  # Distance or Complete IoU
            c2 = (cw ** 2 + ch ** 2) ** alpha + eps  # convex diagonal
            rho_x = torch.abs(b2_x1 + b2_x2 - b1_x1 - b1_x2)
            rho_y = torch.abs(b2_y1 + b2_y2 - b1_y1 - b1_y2)
            rho2 = ((rho_x ** 2 + rho_y ** 2) / 4) ** alpha  # center distance
            if DIoU:
                return iou - rho2 / c2  # DIoU
            elif CIoU:
                v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
                with torch.no_grad():  # the trade-off weight is treated as a constant
                    alpha_ciou = v / ((1 + eps) - inter / union + v)
                # return iou - (rho2 / c2 + v * alpha_ciou)  # CIoU
                return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha))  # CIoU
        else:  # GIoU
            # c_area = cw * ch + eps  # convex area
            # return iou - (c_area - union) / c_area  # GIoU
            c_area = torch.max(cw * ch + eps, union)  # convex area
            return iou - torch.pow((c_area - union) / c_area + eps, alpha)  # GIoU
    return iou  # torch.log(iou+eps) or iou
def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, EIoU=False, SIoU=False, eps=1e-7):
    """Return the IoU (or a penalised variant) of ``box1`` to ``box2``.

    ``box1`` is indexed as 4 coordinates; ``box2`` is transposed first, so it
    is expected as n x 4.

    Args:
        box1: first box(es), indexed ``box1[0..3]``.
        box2: second boxes, n x 4 before the internal transpose.
        x1y1x2y2: coordinates are corner format when true, else
            (center x, center y, width, height).
        GIoU, DIoU, CIoU, EIoU, SIoU: select the variant. The flags are
            checked in the order DIoU, CIoU, SIoU, then EIoU; GIoU applies
            when only ``GIoU`` is set.
        eps: small constant guarding divisions.

    Returns:
        Tensor with the selected IoU variant.

    The SIoU branch adds ``eps`` under the square root of the centre
    distance: when both centres coincide the distance is zero, and the
    unguarded ``0 / 0`` turned the whole SIoU value into NaN.
    """
    box2 = box2.T

    # Get the coordinates of bounding boxes
    if x1y1x2y2:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
    else:  # transform from xywh to xyxy
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

    # Intersection area
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union Area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps

    iou = inter / union
    if CIoU or DIoU or GIoU or EIoU or SIoU:
        cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)  # convex (smallest enclosing box) width
        ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
        if CIoU or DIoU or EIoU or SIoU:  # Distance or Complete IoU
            c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
            rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
                    (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center distance squared
            if DIoU:  # DIoU
                return iou - rho2 / c2  # DIoU
            elif CIoU:  # CIoU
                v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
                with torch.no_grad():  # the trade-off weight is treated as a constant
                    alpha = v / (v - iou + (1 + eps))
                return iou - (rho2 / c2 + v * alpha)  # CIoU
            elif SIoU:  # SIoU
                s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5
                s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5
                # eps keeps sigma (and its gradient) finite for coincident centres
                sigma = torch.pow(s_cw ** 2 + s_ch ** 2 + eps, 0.5)
                sin_alpha_1 = torch.abs(s_cw) / sigma
                sin_alpha_2 = torch.abs(s_ch) / sigma
                threshold = pow(2, 0.5) / 2
                sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
                angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
                rho_x = (s_cw / cw) ** 2
                rho_y = (s_ch / ch) ** 2
                gamma = angle_cost - 2
                distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y)
                omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
                omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
                shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
                return iou - 0.5 * (distance_cost + shape_cost)
            else:  # EIoU
                w_dis = torch.pow(b1_x2 - b1_x1 - b2_x2 + b2_x1, 2)
                h_dis = torch.pow(b1_y2 - b1_y1 - b2_y2 + b2_y1, 2)
                cw2 = torch.pow(cw, 2) + eps
                ch2 = torch.pow(ch, 2) + eps
                return iou - (rho2 / c2 + w_dis / cw2 + h_dis / ch2)
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area  # GIoU
    return iou  # IoU
- 首先在utils/loss.py导入方法:from utils.metrics import bbox_iou, bbox_alpha_iou
- 注释官方代码:iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()
iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, DIoU=True)
iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, GIoU=True)
iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, EIoU=True)
iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, SIoU=True)
alpha_ciou:iou = bbox_alpha_iou(pbox.T, tbox[i], x1y1x2y2=False, alpha=3, CIoU=True)
from utils.metrics import bbox_iou
# 在utils/general.py中添加
def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False):
    """Run Gaussian Soft-NMS on inference results.

    Instead of discarding boxes that overlap the current best box, their
    confidence is decayed by ``exp(-iou ** 2 / sigma)`` and they are dropped
    only once the decayed confidence falls to ``conf_thres`` or below.

    Args:
        prediction: Tensor indexed as (image, candidate, 5 + nc). Column 4 is
            read as objectness and columns 5+ as per-class scores; columns 0-3
            are passed to ``xywh2xyxy``.
        conf_thres: Confidence threshold in [0, 1], applied before and during
            suppression.
        iou_thres: Validated to lie in [0, 1] but otherwise unused here,
            because Soft-NMS decays scores rather than applying a hard IoU cut.
        multi_label: Allow several class labels per box (only when nc > 1).

    Returns:
        A list with one (n, 6) tensor per image, rows ``[xyxy, conf, cls]``,
        ordered by descending confidence.
    """
    nc = prediction.shape[2] - 5  # number of classes

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    time_limit = 10.0  # seconds to quit after
    max_per_class = 30000  # cap on boxes processed per class
    sigma = 0.5  # bandwidth of the Gaussian score decay
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)

    t = time.time()
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints (boolean indexing copies, so `prediction` is not mutated below)
        x = x[x[:, 4] > conf_thres]  # confidence
        x = x[(x[:, 2:4] > min_wh).all(1) & (x[:, 2:4] < max_wh).all(1)]

        # If none remain process next image
        if len(x) == 0:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5].unsqueeze(1), j.float().unsqueeze(1)), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1)
            x = torch.cat((box, conf.unsqueeze(1), j.float().unsqueeze(1)), 1)[conf.view(-1) > conf_thres]

        # If none remain process next image
        if len(x) == 0:
            continue

        x = x[x[:, 4].argsort(descending=True)]  # sort by confidence

        # Per-class Soft-NMS
        det_max = []
        cls = x[:, -1]  # classes
        for c in cls.unique():
            dc = x[cls == c]  # detections of this class, still sorted by confidence
            n = len(dc)
            if n == 1:
                det_max.append(dc)  # a lone box has nothing to suppress
                continue
            elif n > max_per_class:
                dc = dc[:max_per_class]

            while len(dc):
                det_max.append(dc[:1])  # keep the current head box
                if len(dc) == 1:
                    break
                iou = bbox_iou(dc[0], dc[1:])  # overlap of the head with the rest
                dc = dc[1:]
                dc[:, 4] *= torch.exp(-iou ** 2 / sigma)  # Gaussian decay of overlapping scores
                dc = dc[dc[:, 4] > conf_thres]

        if len(det_max):
            det_max = torch.cat(det_max)
            output[xi] = det_max[(-det_max[:, 4]).argsort()]

        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
在def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False)函数中找到 iou = bbox_iou(dc[0], dc[1:]),将其替换为iou = bbox_iou(dc[0], dc[1:], CIoU=True)即可
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
在def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False)函数中找到 iou = bbox_iou(dc[0], dc[1:]),将其替换为iou = bbox_iou(dc[0], dc[1:], DIoU=True)即可
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
在def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False)函数中找到 iou = bbox_iou(dc[0], dc[1:]),将其替换为iou = bbox_iou(dc[0], dc[1:], EIoU=True)即可
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
在def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False)函数中找到 iou = bbox_iou(dc[0], dc[1:]),将其替换为iou = bbox_iou(dc[0], dc[1:], SIoU=True)即可
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
在def soft_nms(prediction, conf_thres=0.25, iou_thres=0.45, multi_label=False)函数中找到 iou = bbox_iou(dc[0], dc[1:]),将其替换为iou = bbox_iou(dc[0], dc[1:], GIoU=True)即可
使用说明:
在val.py将 out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) 替换为 out = soft_nms(out, conf_thres, iou_thres, multi_label=True)
# 此文件放于utils/general.py文件
def NMS(boxes, scores, iou_thres, class_nms='CIoU'):
    """Greedy NMS using a selectable IoU variant as the overlap measure.

    Args:
        boxes: Tensor of boxes, indexed as ``boxes[i, :]``; rows are passed to
            ``bbox_iou`` unchanged.
        scores: 1-D tensor of confidences, one per box.
        iou_thres: Boxes whose overlap with an already kept box exceeds this
            value are suppressed.
        class_nms: One of 'CIoU', 'DIoU', 'GIoU', 'EIoU'; any other value
            (including 'SIoU') selects SIoU.

    Returns:
        1-D int64 tensor of kept indices into ``boxes``, in descending score
        order. Empty (but still int64) when there are no boxes.
    """
    # Exactly one metric flag is enabled; unknown names fall back to SIoU.
    flags = {name: class_nms == name for name in ('GIoU', 'DIoU', 'CIoU', 'EIoU')}
    flags['SIoU'] = not any(flags.values())

    order = torch.argsort(scores, dim=-1, descending=True)
    keep = []
    while order.numel() > 0:
        index = order[0]  # highest-scoring remaining box
        keep.append(index)
        if order.numel() == 1:
            break
        iou = bbox_iou(boxes[index, :], boxes[order[1:], :], **flags)
        inds = torch.nonzero(iou <= iou_thres).reshape(-1)
        order = order[inds + 1]  # +1 because `iou` was computed against order[1:]

    if not keep:
        # An empty float tensor could not be used for indexing by the caller.
        return torch.zeros((0,), dtype=torch.long, device=scores.device)
    return torch.stack(keep)
其次将non_max_suppression(utils/general.py文件)方法中的i = torchvision.ops.nms(boxes, scores, iou_thres)注释,改为i = NMS(boxes, scores, iou_thres, class_nms='xxx')
i = NMS(boxes, scores, iou_thres, class_nms='DIoU')
i = NMS(boxes, scores, iou_thres, class_nms='GIoU')
i = NMS(boxes, scores, iou_thres, class_nms='CIoU')
i = NMS(boxes, scores, iou_thres, class_nms='EIoU')
i = NMS(boxes, scores, iou_thres, class_nms='SIoU')