ModelCompression
Bamboo Lv3

模型压缩通用方法

模型压缩是指将原本的大网络模型通过一些技术手段,压缩成为具有更好的实时性或参数量更小的模型。常见的模型压缩技术包括网络剪枝、神经架构搜索、知识蒸馏和量化。

网络剪枝(network pruning)是指去掉网络模型中不必要的参数。网络剪枝的一般步骤是:训练一个大网络、评估每个参数的重要性、去掉不重要的参数以及微调去掉参数后的网络以恢复剪枝损失的部分精度。剪枝可以利用大模型本身容易训练到较高精度的优势,以最小的精度损失代价来获得更小的模型。

  • 如何确定要保留什么结构以及修剪哪些结构?

1.可以修剪绝对值(或幅度)最小的权重。(属于非结构化剪枝,无法加速稀疏矩阵计算)

2.根据过滤器的范数(L1或者L2)对过滤器进行排序,修剪范数较小的过滤器。(属于结构化剪枝,可直接减少卷积层的通道数)

3.在要修剪的每组图层之后为每个特征图插入一个可学习的乘法参数,当参数减少到0时,有效修剪了负责这个通道的整套参数,这个参数的大小说明了所有参数的重要性。

4.在小批量训练数据上累积梯度,并根据该梯度与每个参数的相应权重之间的乘积进行修剪。

知识蒸馏(knowledge distillation)利用大型教师模型网络参数包含的知识监督小型学生模型,使其能够在一定程度上拟合大的教师模型的输出,从而提高学生模型的精度,以得到更高精度的紧凑小模型。

量化(quantization)是指通过一定技术手段降低模型的数字精度以达到压缩模型、加快推理速度的效果,是模型部署常用的技术之一。

接下来以一个简单的CNN模型为例进行学习。

剪枝原理:

剪枝基于权重的重要性进行,例中使用了L1范数作为重要性度量

将不重要的权重设置为0,从而减少模型计算量

剪枝后的模型需要微调来恢复性能

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

class SimpleCNN(nn.Module):
    """A small two-stage CNN for 32x32 RGB inputs, used as the pruning example."""

    def __init__(self):
        super(SimpleCNN, self).__init__()
        # Two conv stages (3 -> 16 -> 32 channels), each followed by ReLU + 2x2 pool.
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        # After two 2x2 pools a 32x32 input is reduced to 8x8 spatially.
        self.fc1 = nn.Linear(32 * 8 * 8, 10)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Return (N, 10) logits for input of shape (N, 3, 32, 32)."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))
        return self.fc1(x.view(-1, 32 * 8 * 8))

def count_parameters(model):
    """Count total and zero-valued weight entries over all Conv2d/Linear layers.

    Returns:
        (total_params, zero_params) tuple of ints.
    """
    total_params, zero_params = 0, 0
    for _, module in model.named_modules():
        if not isinstance(module, (nn.Conv2d, nn.Linear)):
            continue
        # A pruned module carries a 'weight_mask' buffer; count zeros in the
        # mask there, otherwise inspect the raw weight tensor directly.
        tensor = module.weight_mask if hasattr(module, 'weight_mask') else module.weight
        total_params += tensor.numel()
        zero_params += torch.sum(tensor == 0).item()
    return total_params, zero_params

def apply_pruning(model, pruning_ratio):
    """L1-unstructured prune every Conv2d/Linear layer by `pruning_ratio`.

    The fraction `pruning_ratio` of weights with the smallest absolute
    values in each layer is masked to zero; per-layer statistics are
    printed after each layer is pruned.
    """
    prunable = (nn.Conv2d, nn.Linear)
    for name, module in model.named_modules():
        if not isinstance(module, prunable):
            continue
        prune.l1_unstructured(module, name='weight', amount=pruning_ratio)
        # Report how many entries of this layer's mask were zeroed out.
        mask = module.weight_mask
        total = mask.numel()
        zeros = torch.sum(mask == 0).item()
        print(f"Layer {name}:")
        print(f"总参数: {total}")
        print(f"被剪枝参数: {zeros}")
        print(f"层压缩率: {zeros/total*100:.2f}%\n")

# Demo: report sparsity of a fresh SimpleCNN before and after pruning.
model = SimpleCNN()

print("剪枝前参数统计:")
total, zeros = count_parameters(model)
print(f"总参数数量: {total}")
print(f"零参数数量: {zeros}")
print(f"初始压缩率: {zeros/total*100:.2f}%\n")

# Apply a 50% pruning ratio (a previous comment said 90%, but 0.5 is used).
print("应用50%剪枝...")
apply_pruning(model, 0.5)

print("\n剪枝后总体参数统计:")
total, zeros = count_parameters(model)
print(f"总参数数量: {total}")
print(f"零参数数量: {zeros}")
print(f"最终压缩率: {zeros/total*100:.2f}%")

量化是将模型的权重从32位浮点数转换为低位数值(如8位整数)的过程。

可参考代码:https://github.com/BastianChen/Model-Compression-Demo/tree/master/quantization

知识蒸馏是一种将大模型(教师模型)的知识转移到小模型(学生模型)的方法。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


# Teacher model (same architecture as the earlier SimpleCNN).
class TeacherCNN(nn.Module):
    """Two-stage CNN teacher for 32x32 RGB inputs (CIFAR-10)."""

    def __init__(self):
        super(TeacherCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        # Two 2x2 pools reduce 32x32 inputs to 8x8 feature maps.
        self.fc1 = nn.Linear(32 * 8 * 8, 10)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Return (N, 10) logits for input of shape (N, 3, 32, 32)."""
        h = self.pool(self.relu(self.conv1(x)))
        h = self.pool(self.relu(self.conv2(h)))
        return self.fc1(h.view(-1, 32 * 8 * 8))


# Student model (half the channel widths of the teacher).
class StudentCNN(nn.Module):
    """Compact two-stage CNN student for 32x32 RGB inputs."""

    def __init__(self):
        super(StudentCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        # Two 2x2 pools reduce 32x32 inputs to 8x8 feature maps.
        self.fc1 = nn.Linear(16 * 8 * 8, 10)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Return (N, 10) logits for input of shape (N, 3, 32, 32)."""
        h = self.pool(self.relu(self.conv1(x)))
        h = self.pool(self.relu(self.conv2(h)))
        return self.fc1(h.view(-1, 16 * 8 * 8))


class DistillationLoss(nn.Module):
    """Knowledge-distillation loss combining soft teacher targets with hard labels.

    loss = alpha * T^2 * CE(soft_student, soft_teacher) + (1 - alpha) * CE(student, labels)

    The soft term is averaged over the batch ("batchmean" convention).
    BUG FIX: the original summed the soft term over the whole batch while the
    hard term was a batch mean, so the effective alpha balance between the two
    terms depended on batch size.

    Args:
        temperature: softmax temperature T; higher values soften the teacher
            distribution. The soft term is scaled by T**2 so its gradient
            magnitude stays comparable across temperatures (Hinton et al., 2015).
    """

    def __init__(self, temperature=3.0):
        super(DistillationLoss, self).__init__()
        self.temperature = temperature

    def forward(self, student_outputs, teacher_outputs, labels, alpha=0.5):
        """Return the combined distillation loss.

        Args:
            student_outputs: (N, C) student logits.
            teacher_outputs: (N, C) teacher logits (no gradient required).
            labels: (N,) ground-truth class indices.
            alpha: weight of the soft-target term, in [0, 1].
        """
        t = self.temperature
        batch_size = student_outputs.size(0)

        # Soft-target cross-entropy between temperature-scaled distributions,
        # averaged over the batch and rescaled by T^2.
        soft_targets = F.softmax(teacher_outputs / t, dim=1)
        soft_prob = F.log_softmax(student_outputs / t, dim=1)
        soft_targets_loss = -torch.sum(soft_targets * soft_prob) / batch_size * (t ** 2)

        # Hard-label cross-entropy on the raw student logits (batch mean).
        hard_loss = F.cross_entropy(student_outputs, labels)

        return (alpha * soft_targets_loss) + ((1 - alpha) * hard_loss)


def load_cifar10(batch_size=128):
    """Download (if needed) and return CIFAR-10 train/test DataLoaders.

    Images are converted to tensors and normalized channel-wise to
    roughly [-1, 1]. The train loader shuffles; the test loader does not.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    loaders = []
    for is_train in (True, False):
        dataset = torchvision.datasets.CIFAR10(
            root='./data', train=is_train, download=True, transform=transform)
        loaders.append(DataLoader(dataset, batch_size=batch_size,
                                  shuffle=is_train, num_workers=2))
    return tuple(loaders)


def evaluate_model(model, dataloader, device):
    """Return top-1 accuracy (in percent) of `model` over `dataloader`."""
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            # argmax over the class dimension gives the predicted label.
            predictions = model(inputs).argmax(dim=1)
            total += targets.size(0)
            correct += (predictions == targets).sum().item()
    return 100 * correct / total


def train_with_distillation(teacher_model, student_model, train_loader, test_loader,
                            device, epochs=10, temperature=3.0, alpha=0.5):
    """Train `student_model` by distilling knowledge from `teacher_model`.

    The teacher is frozen in eval mode; the student is optimized with Adam
    against DistillationLoss. After every epoch both models are evaluated
    on `test_loader` and the best student weights so far are checkpointed
    to 'best_student_model.pth'.

    Returns:
        The student model. NOTE(review): this returns the final-epoch
        weights, not necessarily the best checkpoint saved to disk.
    """
    teacher_model.to(device)
    student_model.to(device)

    teacher_model.eval()
    student_model.train()

    optimizer = torch.optim.Adam(student_model.parameters())
    distillation_criterion = DistillationLoss(temperature=temperature)

    best_acc = 0.0

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()

            # Teacher provides soft targets; no gradients needed on its side.
            with torch.no_grad():
                teacher_output = teacher_model(data)

            # Student forward pass.
            student_output = student_model(data)

            # Combined soft/hard distillation loss.
            loss = distillation_criterion(student_output, teacher_output,
                                          target, alpha=alpha)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Log the average loss every 100 mini-batches.
            if batch_idx % 100 == 99:
                print(f'Epoch: {epoch + 1}, Batch: {batch_idx + 1}, '
                      f'Loss: {running_loss / 100:.4f}')
                running_loss = 0.0

        # End-of-epoch evaluation of both models.
        # NOTE(review): evaluate_model leaves the student in eval mode for the
        # next epoch; harmless for this model (no dropout/batchnorm), but in
        # general student_model.train() should be re-called each epoch.
        student_acc = evaluate_model(student_model, test_loader, device)
        teacher_acc = evaluate_model(teacher_model, test_loader, device)
        print(f'\nEpoch {epoch + 1}:')
        print(f'Teacher Accuracy: {teacher_acc:.2f}%')
        print(f'Student Accuracy: {student_acc:.2f}%')

        # Checkpoint the best student observed so far.
        if student_acc > best_acc:
            best_acc = student_acc
            torch.save(student_model.state_dict(), 'best_student_model.pth')

    return student_model


# Usage example: distill TeacherCNN into StudentCNN on CIFAR-10.
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load data.
    train_loader, test_loader = load_cifar10()

    # Build models.
    teacher_model = TeacherCNN()
    student_model = StudentCNN()

    # NOTE(review): the teacher is used here with untrained (random) weights;
    # in practice it must be trained first for distillation to be meaningful.

    # Run knowledge distillation.
    student_model = train_with_distillation(
        teacher_model=teacher_model,
        student_model=student_model,
        train_loader=train_loader,
        test_loader=test_loader,
        device=device,
        epochs=10,
        temperature=3.0,
        alpha=0.5
    )


# Compare model sizes.
def count_parameters(model):
    """Return the total number of elements across all parameters of `model`."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


# Report parameter counts and the resulting compression rate.
teacher_params = count_parameters(teacher_model)
student_params = count_parameters(student_model)

print("\n模型大小比较:")
print(f"教师模型参数数量: {teacher_params:,}")
print(f"学生模型参数数量: {student_params:,}")
print(f"压缩率: {(1 - student_params / teacher_params) * 100:.2f}% ")

优缺点:

  1. 剪枝
  • 优点:实现简单,可以显著减少模型大小
  • 缺点:可能需要反复尝试以找到最佳剪枝比例
  2. 量化
  • 优点:显著减少模型存储空间和推理时间
  • 缺点:可能导致精度轻微下降
  3. 知识蒸馏
  • 优点:可以得到更小但性能相近的模型
  • 缺点:需要训练过程,实现相对复杂

4. NAS神经架构搜索

优点:

  • 自动化设计,减少人工干预。
  • 可能发现人类难以设计的高性能架构。

缺点:

  • 计算成本高,需要大量计算资源。
  • 搜索过程耗时。

神经架构搜索(neural architecture search, NAS)是一种利用强化学习方法同时学习模型架构和相应参数的方法。简单来说,就是在一个定义好的搜索空间内,通过一定的搜索策略,得到最终表现最好的网络。通过加入准确率、推理延迟等指标,网络架构搜索产生的网络结构在轻量化应用中能获得更高的竞争力。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Building block of the search space: conv -> ReLU -> 2x2 max-pool.
class ConvBlock(nn.Module):
    """Conv2d + ReLU + MaxPool2d(2); padding keeps H/W before pooling."""

    def __init__(self, in_channels, out_channels, kernel_size):
        super(ConvBlock, self).__init__()
        # padding = kernel_size // 2 preserves spatial size for odd kernels.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                              padding=kernel_size // 2)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2)

    def forward(self, x):
        """Apply conv, ReLU, then 2x2 pooling (halves spatial dimensions)."""
        return self.pool(self.relu(self.conv(x)))

# Candidate architecture: two ConvBlocks followed by a linear classifier.
def create_model(kernel_size, channels):
    """Build a candidate network for 28x28 single-channel inputs (e.g. MNIST).

    Two ConvBlocks halve the spatial size twice (28 -> 14 -> 7), so the
    classifier sees channels[1] * 7 * 7 features.
    """
    layers = [
        ConvBlock(1, channels[0], kernel_size),
        ConvBlock(channels[0], channels[1], kernel_size),
        nn.Flatten(),
        nn.Linear(channels[1] * 7 * 7, 10),
    ]
    return nn.Sequential(*layers)

# Load MNIST training data (downloads to ./data on first run).
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Random search strategy over the architecture search space.
def random_search(search_space, num_trials=10):
    """Randomly sample `num_trials` architectures, briefly train each, and
    return the best one.

    BUG FIX: the original sampling line for `channels` was missing the
    closing ']' of its subscript, making the function a SyntaxError.

    Args:
        search_space: dict with 'kernel_sizes' (list[int]) and 'channels'
            (list[int]) options to sample from.
        num_trials: number of random architectures to try.

    Returns:
        (best_model, best_accuracy): the highest-scoring model and its
        training-set accuracy in [0, 1].

    NOTE(review): accuracy is measured on the training set, which
    overestimates generalization; a held-out validation split would be
    better for model selection.
    """

    def pick(options):
        # Uniformly sample one element from a list of options.
        return options[torch.randint(0, len(options), (1,)).item()]

    best_accuracy = 0
    best_model = None

    for _ in range(num_trials):
        # Randomly choose hyperparameters for this trial.
        kernel_size = pick(search_space['kernel_sizes'])
        channels = [pick(search_space['channels']) for _ in range(2)]

        # Build the candidate model.
        model = create_model(kernel_size, channels)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        # Briefly train for 2 epochs.
        for epoch in range(2):
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Evaluate accuracy on the training data.
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in train_loader:
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total

        # Keep the best model found so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    return best_model, best_accuracy

# Define the search space of candidate hyperparameters.
search_space = {
    'kernel_sizes': [3, 5],
    'channels': [16, 32, 64]
}

# Run the random search and report the best result.
# NOTE(review): this trains num_trials models for 2 epochs each on MNIST,
# which is slow on CPU.
best_model, best_accuracy = random_search(search_space)
print(f'Best Accuracy: {best_accuracy:.4f}')
print(best_model)
Powered by Hexo & Theme Keep
Total words 28.5k Unique Visitor Page View