Traditional convolutional networks require large amounts of memory and computation, so they cannot run on mobile or embedded devices. MobileNet is a lightweight CNN designed specifically for mobile and embedded applications: compared with VGG16, its accuracy drops by only 0.9% while the model has only about 1/32 of VGG16's parameters.
Depthwise Separable Convolution: a depthwise convolution followed by a 1x1 pointwise convolution.
Two hyperparameters are introduced: α (width multiplier, scales the number of filters/channels) and β (resolution multiplier, scales the input resolution).
Depthwise separable convolution (diagram)
Implementation of depthwise separable convolution (a minimal sketch follows these notes)
Activation function: ReLU
Known issue: after training, many of the depthwise (DW) convolution kernels have parameters that are mostly zero, i.e. those filters learn very little.
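To make the depthwise separable idea concrete, here is a minimal PyTorch sketch (the module name and channel arguments are illustrative, not from the original notes): a 3x3 depthwise convolution with groups equal to the input channels, followed by a 1x1 pointwise convolution.

import torch
from torch import nn

class DepthwiseSeparableConv(nn.Module):
    """Depthwise 3x3 conv (one filter per input channel) + 1x1 pointwise conv."""
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # Depthwise: groups=in_channels gives each input channel its own filter.
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=3,
                                   stride=stride, padding=1,
                                   groups=in_channels, bias=False)
        # Pointwise: 1x1 conv mixes information across channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.bn1(self.depthwise(x)))
        x = self.relu(self.bn2(self.pointwise(x)))
        return x

# Parameter count drops from k*k*Cin*Cout (standard conv) to k*k*Cin + Cin*Cout.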
MobileNet V2: compared with V1, the model is smaller and the accuracy is higher.
Inverted Residuals: expand the channels first, then reduce them (the opposite of an ordinary residual bottleneck).
Linear Bottlenecks: the last layer of the inverted residual block uses a linear activation function.
Activation function: ReLU6, applied after the expansion and depthwise (DW) convolutions.
The last layer of the inverted residual block uses a linear activation because ReLU easily destroys information in low-dimensional feature spaces.
Inverted residual structure (diagram)
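A simplified sketch of the block layout described above (illustrative only; the full InvertedResidual class with the shortcut logic appears in the V2 code at the end of these notes):

from torch import nn

# 1x1 conv expands channels by t, 3x3 depthwise conv, 1x1 conv projects back (linear output).
def inverted_residual(in_c, out_c, stride, t):
    hidden = in_c * t
    return nn.Sequential(
        nn.Conv2d(in_c, hidden, 1, bias=False),                       # expand
        nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
        nn.Conv2d(hidden, hidden, 3, stride, 1, groups=hidden, bias=False),  # depthwise
        nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
        nn.Conv2d(hidden, out_c, 1, bias=False), nn.BatchNorm2d(out_c),      # project, linear
    )
# In the real block the expand conv is skipped when t == 1, and the
# shortcut x + block(x) is used only when stride == 1 and in_c == out_c.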
Network structure
t: expansion factor, the expansion ratio of the first 1x1 convolution layer in the inverted residual block
c: number of output channels
n: number of times the bottleneck is repeated
s: stride; applies only to the first block of each stage, the remaining blocks use stride 1
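A small runnable sketch showing how one t/c/n/s row unrolls into n blocks (the row and the incoming channel count are taken from the V2 configuration table in the code below):

# Row (t=6, c=32, n=3, s=2): only the first repeat uses stride s.
t, c, n, s = 6, 32, 3, 2
in_ch = 24  # output channels of the previous stage
for i in range(n):
    stride = s if i == 0 else 1
    print(f"InvertedResidual(in={in_ch}, out={c}, stride={stride}, expand_ratio={t})")
    in_ch = c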
Performance comparison
1) Classification
2) Object detection
MobileNet V3: compared with V2 the network is faster and more efficient, improving accuracy by 3.2% while reducing latency by 20%.
1) Updated block: adds the SE (Squeeze-and-Excitation) module and updates the activation functions.
How SE works: global average pooling over each channel -> 1x1 conv (squeeze) -> ReLU -> 1x1 conv (excite) -> hard-sigmoid -> multiply the resulting per-channel weights back onto the feature map (see the SqueezeExcitation class in the V3 code below).
2) Uses NAS (neural architecture search) to search for the network parameters.
3) Redesigns the time-consuming layers:
Reduce the number of filters in the first convolution layer (32 -> 16).
Going from 32 to 16 filters keeps the accuracy unchanged while saving 2 ms of latency and about 2 million operations.
Streamline the Last Stage.
With no loss of accuracy, this saves 7 ms (about 11% of the total inference time) and roughly 30 million multiply-adds.
Redesigned activation functions (h-swish / h-sigmoid), sketched below.
They are friendly to both inference speed and the quantization process.
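The redesigned activations replace sigmoid and swish with piecewise-linear approximations built from ReLU6, which is why they are cheap to compute and quantization-friendly. A minimal sketch of the definitions (PyTorch also ships them as nn.Hardsigmoid and nn.Hardswish):

import torch
import torch.nn.functional as F

def h_sigmoid(x):
    # hard sigmoid: ReLU6(x + 3) / 6, a piecewise-linear approximation of sigmoid
    return F.relu6(x + 3.0) / 6.0

def h_swish(x):
    # hard swish: x * h_sigmoid(x), approximates x * sigmoid(x)
    return x * h_sigmoid(x)

x = torch.linspace(-6, 6, 7)
print(h_sigmoid(x))
print(h_swish(x))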
Network structure: MobileNetV3-Large
Network structure: MobileNetV3-Small
Comparison of the Large and Small networks
MobileNetV2
from torch import nn
import torch


def _make_divisible(ch, divisor=8, min_ch=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True)
        )


class InvertedResidual(nn.Module):
    def __init__(self, in_channel, out_channel, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        hidden_channel = in_channel * expand_ratio
        self.use_shortcut = stride == 1 and in_channel == out_channel

        layers = []
        if expand_ratio != 1:
            # 1x1 pointwise conv
            layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1))
        layers.extend([
            # 3x3 depthwise conv
            ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel),
            # 1x1 pointwise conv (linear)
            nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channel),
        ])

        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_shortcut:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self, num_classes=1000, alpha=1.0, round_nearest=8):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = _make_divisible(32 * alpha, round_nearest)
        last_channel = _make_divisible(1280 * alpha, round_nearest)

        inverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        features = []
        # conv1 layer
        features.append(ConvBNReLU(3, input_channel, stride=2))
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * alpha, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, last_channel, 1))
        # combine feature layers
        self.features = nn.Sequential(*features)

        # building classifier
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes)  # fully connected layer (the paper uses a 1x1 conv here)
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
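A brief usage sketch for the V2 model above (a dummy 224x224 input; no pretrained weights are loaded here):

model = MobileNetV2(num_classes=1000, alpha=1.0)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])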
MobileNet V3
from typing import Callable, List, Optional
from functools import partial

import torch
from torch import nn, Tensor
from torch.nn import functional as F


def _make_divisible(ch, divisor=8, min_ch=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch


class ConvBNActivation(nn.Sequential):
    def __init__(self,
                 in_planes: int,
                 out_planes: int,
                 kernel_size: int = 3,
                 stride: int = 1,
                 groups: int = 1,
                 norm_layer: Optional[Callable[..., nn.Module]] = None,
                 activation_layer: Optional[Callable[..., nn.Module]] = None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if activation_layer is None:
            activation_layer = nn.ReLU6
        super(ConvBNActivation, self).__init__(
            nn.Conv2d(in_channels=in_planes,
                      out_channels=out_planes,
                      kernel_size=kernel_size,
                      stride=stride,
                      padding=padding,
                      groups=groups,
                      bias=False),
            norm_layer(out_planes),
            activation_layer(inplace=True))


class SqueezeExcitation(nn.Module):
    def __init__(self, input_c: int, squeeze_factor: int = 4):
        super(SqueezeExcitation, self).__init__()
        squeeze_c = _make_divisible(input_c // squeeze_factor, 8)
        self.fc1 = nn.Conv2d(input_c, squeeze_c, 1)
        self.fc2 = nn.Conv2d(squeeze_c, input_c, 1)

    def forward(self, x: Tensor) -> Tensor:
        scale = F.adaptive_avg_pool2d(x, output_size=(1, 1))
        scale = self.fc1(scale)
        scale = F.relu(scale, inplace=True)
        scale = self.fc2(scale)
        scale = F.hardsigmoid(scale, inplace=True)
        return scale * x


class InvertedResidualConfig:
    def __init__(self,
                 input_c: int,
                 kernel: int,
                 expanded_c: int,
                 out_c: int,
                 use_se: bool,
                 activation: str,
                 stride: int,
                 width_multi: float):
        self.input_c = self.adjust_channels(input_c, width_multi)
        self.kernel = kernel
        self.expanded_c = self.adjust_channels(expanded_c, width_multi)
        self.out_c = self.adjust_channels(out_c, width_multi)
        self.use_se = use_se
        self.use_hs = activation == "HS"  # whether using h-swish activation
        self.stride = stride

    @staticmethod
    def adjust_channels(channels: int, width_multi: float):
        return _make_divisible(channels * width_multi, 8)


class InvertedResidual(nn.Module):
    def __init__(self,
                 cnf: InvertedResidualConfig,
                 norm_layer: Callable[..., nn.Module]):
        super(InvertedResidual, self).__init__()

        if cnf.stride not in [1, 2]:
            raise ValueError("illegal stride value.")

        self.use_res_connect = (cnf.stride == 1 and cnf.input_c == cnf.out_c)

        layers: List[nn.Module] = []
        activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU

        # expand
        if cnf.expanded_c != cnf.input_c:
            layers.append(ConvBNActivation(cnf.input_c,
                                           cnf.expanded_c,
                                           kernel_size=1,
                                           norm_layer=norm_layer,
                                           activation_layer=activation_layer))

        # depthwise
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.expanded_c,
                                       kernel_size=cnf.kernel,
                                       stride=cnf.stride,
                                       groups=cnf.expanded_c,
                                       norm_layer=norm_layer,
                                       activation_layer=activation_layer))

        if cnf.use_se:
            layers.append(SqueezeExcitation(cnf.expanded_c))

        # project
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.out_c,
                                       kernel_size=1,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Identity))

        self.block = nn.Sequential(*layers)
        self.out_channels = cnf.out_c
        self.is_strided = cnf.stride > 1

    def forward(self, x: Tensor) -> Tensor:
        result = self.block(x)
        if self.use_res_connect:
            result += x
        return result


class MobileNetV3(nn.Module):
    def __init__(self,
                 inverted_residual_setting: List[InvertedResidualConfig],
                 last_channel: int,
                 num_classes: int = 1000,
                 block: Optional[Callable[..., nn.Module]] = None,
                 norm_layer: Optional[Callable[..., nn.Module]] = None):
        super(MobileNetV3, self).__init__()

        if not inverted_residual_setting:
            raise ValueError("The inverted_residual_setting should not be empty.")
        elif not (isinstance(inverted_residual_setting, List) and
                  all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting])):
            raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        layers: List[nn.Module] = []

        # building first layer
        firstconv_output_c = inverted_residual_setting[0].input_c
        layers.append(ConvBNActivation(3,
                                       firstconv_output_c,
                                       kernel_size=3,
                                       stride=2,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Hardswish))
        # building inverted residual blocks
        for cnf in inverted_residual_setting:
            layers.append(block(cnf, norm_layer))

        # building last several layers
        lastconv_input_c = inverted_residual_setting[-1].out_c
        lastconv_output_c = 6 * lastconv_input_c
        layers.append(ConvBNActivation(lastconv_input_c,
                                       lastconv_output_c,
                                       kernel_size=1,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Hardswish))
        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(nn.Linear(lastconv_output_c, last_channel),
                                        nn.Hardswish(inplace=True),
                                        nn.Dropout(p=0.2, inplace=True),
                                        nn.Linear(last_channel, num_classes))

        # initial weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)


def mobilenet_v3_large(num_classes: int = 1000,
                       reduced_tail: bool = False) -> MobileNetV3:
    """
    Constructs a large MobileNetV3 architecture from
    "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>.

    weights_link:
    https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth

    Args:
        num_classes (int): number of classes
        reduced_tail (bool): If True, reduces the channel counts of all feature layers
            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
            backbone for Detection and Segmentation.
    """
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)

    reduce_divider = 2 if reduced_tail else 1

    inverted_residual_setting = [
        # input_c, kernel, expanded_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, False, "RE", 1),
        bneck_conf(16, 3, 64, 24, False, "RE", 2),  # C1
        bneck_conf(24, 3, 72, 24, False, "RE", 1),
        bneck_conf(24, 5, 72, 40, True, "RE", 2),  # C2
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 3, 240, 80, False, "HS", 2),  # C3
        bneck_conf(80, 3, 200, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 480, 112, True, "HS", 1),
        bneck_conf(112, 3, 672, 112, True, "HS", 1),
        bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2),  # C4
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
    ]
    last_channel = adjust_channels(1280 // reduce_divider)  # C5

    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classes=num_classes)


def mobilenet_v3_small(num_classes: int = 1000,
                       reduced_tail: bool = False) -> MobileNetV3:
    """
    Constructs a small MobileNetV3 architecture from
    "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>.

    weights_link:
    https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth

    Args:
        num_classes (int): number of classes
        reduced_tail (bool): If True, reduces the channel counts of all feature layers
            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
            backbone for Detection and Segmentation.
    """
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)

    reduce_divider = 2 if reduced_tail else 1

    inverted_residual_setting = [
        # input_c, kernel, expanded_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, True, "RE", 2),  # C1
        bneck_conf(16, 3, 72, 24, False, "RE", 2),  # C2
        bneck_conf(24, 3, 88, 24, False, "RE", 1),
        bneck_conf(24, 5, 96, 40, True, "HS", 2),  # C3
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 120, 48, True, "HS", 1),
        bneck_conf(48, 5, 144, 48, True, "HS", 1),
        bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2),  # C4
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1),
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1)
    ]
    last_channel = adjust_channels(1024 // reduce_divider)  # C5

    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classes=num_classes)