🚀 计算机视觉实战项目¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
3个完整项目:图像分类 → 目标检测 → 多模态检索,覆盖训练、评估、部署全流程,附完整可运行代码。
📋 项目总览¶
| 项目 | 核心技术 | 难度 | 预计耗时 |
|---|---|---|---|
| P1: 图像分类系统 | ResNet/数据增强/AMP/ONNX | ⭐⭐⭐ | 2天 |
| P2: 目标检测系统 | YOLOv8微调/自定义数据 | ⭐⭐⭐⭐ | 3天 |
| P3: 多模态检索系统 | CLIP/向量数据库/图文检索 | ⭐⭐⭐⭐⭐ | 3天 |
🖼️ P1: 图像分类系统¶
1.1 项目简介¶
构建一个生产级图像分类系统,使用ResNet在自定义数据集上微调,包含完整的数据增强、混合精度训练、模型导出和推理部署流程。
技术栈:PyTorch、torchvision、ONNX Runtime、Albumentations
1.2 数据准备与增强¶
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import os
# ===================== Data augmentation =====================
# Training pipeline: geometric + photometric augmentation, then ImageNet
# normalization. NOTE(review): in Albumentations >= 1.4 RandomResizedCrop
# takes size=(h, w); the positional (224, 224) form is the legacy
# (height, width) signature -- confirm against the installed version.
train_transform = A.Compose([
    A.RandomResizedCrop(224, 224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.2),
    A.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1, p=0.8),
    A.OneOf([  # apply at most one of the blur/noise ops (p=0.3 overall)
        A.GaussianBlur(blur_limit=7),
        A.GaussNoise(var_limit=(10, 50)),
        A.MotionBlur(blur_limit=7),
    ], p=0.3),
    A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3),  # Cutout-style occlusion
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet statistics
    ToTensorV2(),  # HWC numpy array -> CHW torch tensor
])

# Validation pipeline: deterministic resize + center crop, no augmentation.
val_transform = A.Compose([
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])
# ===================== 数据集 =====================
class ImageClassificationDataset(Dataset):
    """Image-folder classification dataset.

    Expects the standard "one sub-directory per class" layout::

        data/
            train/
                class_0/ img1.jpg, img2.jpg, ...
                class_1/ img1.jpg, img2.jpg, ...
            val/
                class_0/ ...
                class_1/ ...

    Args:
        root_dir: Directory containing one sub-directory per class.
        transform: Optional Albumentations transform, applied as
            ``transform(image=ndarray)["image"]``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Only sub-directories are classes; stray files in root_dir
        # (e.g. .DS_Store) would otherwise corrupt the label mapping
        # and crash the os.listdir() below.
        self.classes = sorted(
            d for d in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, d))
        )
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.samples = []
        for cls_name in self.classes:
            cls_dir = os.path.join(root_dir, cls_name)
            # sorted() makes sample order deterministic across filesystems
            for fname in sorted(os.listdir(cls_dir)):
                if fname.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                    self.samples.append(
                        (os.path.join(cls_dir, fname), self.class_to_idx[cls_name])
                    )

    def __len__(self):
        """Number of (image, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample: decoded RGB image (optionally transformed) and int label."""
        img_path, label = self.samples[idx]
        image = np.array(Image.open(img_path).convert("RGB"))
        if self.transform:
            image = self.transform(image=image)["image"]
        return image, label
# Build the datasets and data loaders
train_dataset = ImageClassificationDataset("data/train", transform=train_transform)
val_dataset = ImageClassificationDataset("data/val", transform=val_transform)
# pin_memory speeds up host->GPU copies; workers decode/augment in parallel
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
print(f"训练集: {len(train_dataset)} 张, {len(train_dataset.classes)} 类")
print(f"验证集: {len(val_dataset)} 张")
1.3 模型构建与训练¶
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from sklearn.metrics import classification_report, confusion_matrix
import time
# ===================== Model definition =====================
class ImageClassifier(nn.Module):
    """ResNet-50 backbone with a dropout-regularized two-layer MLP head."""

    def __init__(self, num_classes, pretrained=True):
        super().__init__()
        weights = models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        self.backbone = models.resnet50(weights=weights)
        hidden = 512
        # Replace the stock 1000-class fc with a small classification head.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.backbone.fc.in_features, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(hidden, num_classes),
        )

    def forward(self, x):
        """Return raw class logits for a batch of images."""
        return self.backbone(x)
# ===================== 训练循环 =====================
def train_one_epoch(model, loader, criterion, optimizer, scaler, device):
model.train() # train()训练模式
total_loss, correct, total = 0, 0, 0
for images, labels in loader:
images, labels = images.to(device), labels.to(device) # 移至GPU/CPU
optimizer.zero_grad() # 清零梯度
with autocast('cuda'): # 混合精度
outputs = model(images)
loss = criterion(outputs, labels)
scaler.scale(loss).backward() # 反向传播计算梯度
scaler.unscale_(optimizer)
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()
total_loss += loss.item() * images.size(0) # 将单元素张量转为Python数值
_, preds = outputs.max(1)
correct += preds.eq(labels).sum().item()
total += labels.size(0)
return total_loss / total, correct / total
@torch.no_grad()  # inference only: no gradients needed
def evaluate(model, loader, criterion, device):
    """Evaluate *model* on *loader*.

    Returns:
        (mean_loss, accuracy, all_preds, all_labels) where the last two are
        flat Python lists of per-sample predictions and ground-truth labels.
    """
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds, all_labels = [], []
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        # Match the device actually in use instead of hard-coding 'cuda'.
        with autocast(torch.device(device).type):
            outputs = model(images)
            loss = criterion(outputs, labels)
        total_loss += loss.item() * images.size(0)
        _, preds = outputs.max(1)
        correct += preds.eq(labels).sum().item()
        total += labels.size(0)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    return total_loss / total, correct / total, all_preds, all_labels
# ===================== Main training flow =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(train_dataset.classes)
model = ImageClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # smoothing regularizes overconfident logits
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)  # cosine LR with warm restarts
scaler = GradScaler()  # AMP loss scaler
best_acc = 0
num_epochs = 30
for epoch in range(num_epochs):
    start = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, scaler, device)
    val_loss, val_acc, preds, labels = evaluate(model, val_loader, criterion, device)
    scheduler.step()  # advance the learning-rate schedule (not the weights)
    elapsed = time.time() - start
    print(f"Epoch {epoch+1}/{num_epochs} ({elapsed:.1f}s) | "
          f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")
    # Keep only the checkpoint with the best validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f" ✅ 保存最佳模型, Acc={best_acc:.4f}")

# Final evaluation with the best checkpoint reloaded
model.load_state_dict(torch.load("best_model.pth", weights_only=True))
_, _, preds, labels = evaluate(model, val_loader, criterion, device)
print("\n" + classification_report(labels, preds, target_names=train_dataset.classes))
1.4 ONNX导出与推理部署¶
import onnxruntime as ort
# ===================== ONNX export =====================
model.eval()  # disable dropout before tracing
dummy_input = torch.randn(1, 3, 224, 224).to(device)  # example input used for tracing
torch.onnx.export(
    model, dummy_input, "classifier.onnx",
    input_names=["image"],
    output_names=["logits"],
    dynamic_axes={"image": {0: "batch"}, "logits": {0: "batch"}},  # allow variable batch size
    opset_version=17
)
print("✅ ONNX模型已导出")
# ===================== ONNX inference =====================
class ONNXClassifier:
    """Thin wrapper around an ONNX Runtime session for image classification."""

    def __init__(self, model_path, class_names):
        # Prefer CUDA when available; ONNX Runtime falls back to CPU otherwise.
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        self.session = ort.InferenceSession(model_path, providers=providers)
        self.class_names = class_names
        self.transform = val_transform  # same deterministic preprocessing as validation

    def predict(self, image_path):
        """Classify one image; return top-1 class, its confidence, and a top-5 list."""
        rgb = np.array(Image.open(image_path).convert("RGB"))
        batch = self.transform(image=rgb)["image"].unsqueeze(0).numpy()  # add batch dim
        logits = self.session.run(None, {"image": batch})[0][0]
        probs = self._softmax(logits)
        best = int(probs.argmax())
        return {
            "class": self.class_names[best],
            "confidence": float(probs[best]),
            "top5": [(self.class_names[i], float(probs[i]))
                     for i in probs.argsort()[::-1][:5]],  # descending probability
        }

    @staticmethod
    def _softmax(x):
        """Numerically stable softmax over a 1-D logit vector."""
        shifted = np.exp(x - np.max(x))
        return shifted / shifted.sum()
# Usage example
classifier = ONNXClassifier("classifier.onnx", train_dataset.classes)
result = classifier.predict("test_image.jpg")
print(f"预测: {result['class']} ({result['confidence']:.2%})")
for cls, prob in result['top5']:
    print(f" {cls}: {prob:.2%}")
🔍 P2: 目标检测系统¶
2.1 项目简介¶
使用YOLOv8在自定义数据集上微调目标检测模型,覆盖数据标注格式、训练配置、模型评估和推理部署全流程。
技术栈:Ultralytics YOLOv8、OpenCV、COCO格式
2.2 数据集准备¶
import yaml
import shutil
from pathlib import Path
# ===================== YOLO数据集目录结构 =====================
"""
dataset/
├── images/
│ ├── train/ # 训练图像
│ └── val/ # 验证图像
├── labels/
│ ├── train/ # 训练标签 (YOLO格式txt)
│ └── val/ # 验证标签
└── data.yaml # 数据集配置
"""
# ===================== Dataset config file =====================
data_config = {
    'path': './dataset',
    'train': 'images/train',
    'val': 'images/val',
    'names': {  # class id -> class name
        0: 'person',
        1: 'car',
        2: 'bicycle',
        3: 'dog',
        4: 'cat',
    }
}
# Write YOLO's data.yaml; the context manager closes the file automatically
with open('dataset/data.yaml', 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)
# ===================== COCO → YOLO 格式转换 =====================
import json
def coco_to_yolo(coco_json_path, output_dir, image_dir):
    """Convert COCO-format annotations to YOLO txt labels.

    One ``<image_stem>.txt`` per image is written to *output_dir*; each line
    is ``cls cx cy w h`` with coordinates normalized to [0, 1].
    Note: *image_dir* is accepted for interface compatibility but unused here.
    """
    from collections import defaultdict

    with open(coco_json_path) as f:
        coco = json.load(f)

    # image id -> image record
    images_by_id = {rec['id']: rec for rec in coco['images']}
    # raw COCO category id -> contiguous 0-based class id
    class_ids = {cat['id']: idx for idx, cat in enumerate(coco['categories'])}

    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # group annotations by the image they belong to
    grouped = defaultdict(list)
    for ann in coco['annotations']:
        grouped[ann['image_id']].append(ann)

    for image_id, rec in images_by_id.items():
        img_w, img_h = rec['width'], rec['height']
        rows = []
        for ann in grouped.get(image_id, []):
            x_min, y_min, box_w, box_h = ann['bbox']  # COCO: top-left corner + size
            # YOLO wants a normalized box center + size
            center_x = (x_min + box_w / 2) / img_w
            center_y = (y_min + box_h / 2) / img_h
            rows.append(
                f"{class_ids[ann['category_id']]} "
                f"{center_x:.6f} {center_y:.6f} "
                f"{box_w / img_w:.6f} {box_h / img_h:.6f}"
            )
        (out_root / (Path(rec['file_name']).stem + '.txt')).write_text('\n'.join(rows))

    print(f"✅ 转换完成: {len(images_by_id)} 张图像的标签已保存到 {output_dir}")
# Usage example
coco_to_yolo("annotations/instances_train.json", "dataset/labels/train", "dataset/images/train")
2.3 YOLOv8 训练¶
from ultralytics import YOLO
# ===================== Model training =====================
# Load a pretrained checkpoint
model = YOLO('yolov8m.pt')  # medium size: accuracy/speed trade-off

# Training configuration
results = model.train(
    data='dataset/data.yaml',
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,
    # optimizer settings
    optimizer='AdamW',
    lr0=0.001,
    lrf=0.01,  # final LR = lr0 * lrf
    weight_decay=0.0005,
    warmup_epochs=3,
    # data augmentation
    mosaic=1.0,  # Mosaic augmentation
    mixup=0.1,  # MixUp augmentation
    copy_paste=0.1,  # Copy-Paste augmentation
    hsv_h=0.015,  # hue jitter
    hsv_s=0.7,  # saturation jitter
    hsv_v=0.4,  # value/brightness jitter
    degrees=10.0,  # rotation
    translate=0.1,  # translation
    scale=0.5,  # scaling
    fliplr=0.5,  # horizontal flip probability
    # checkpointing
    project='runs/detect',
    name='custom_yolov8m',
    save=True,
    save_period=10,  # checkpoint every 10 epochs
    patience=20,  # early-stopping patience (epochs without improvement)
)
print(f"✅ 训练完成! 最佳模型: runs/detect/custom_yolov8m/weights/best.pt")
2.4 模型评估¶
# ===================== Evaluation metrics =====================
model = YOLO('runs/detect/custom_yolov8m/weights/best.pt')

# Validation-set evaluation
metrics = model.val(data='dataset/data.yaml', imgsz=640, batch=32)
print("=" * 50)
print(f"mAP@0.5: {metrics.box.map50:.4f}")
print(f"mAP@0.5:0.95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")
print("=" * 50)

# Per-class AP
for i, name in enumerate(data_config['names'].values()):
    print(f" {name}: mAP50={metrics.box.ap50[i]:.4f}, mAP50-95={metrics.box.ap[i]:.4f}")
# ===================== Custom evaluation function =====================
def compute_iou_single(box_a, box_b):
    """IoU of two axis-aligned boxes given as (x1, y1, x2, y2).

    This helper was referenced but never defined in the original code,
    which made compute_detection_metrics raise NameError at runtime.
    """
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

def compute_detection_metrics(pred_boxes, gt_boxes, iou_threshold=0.5):
    """Greedy single-image detection precision/recall/F1.

    Args:
        pred_boxes: list of (x1, y1, x2, y2, conf, cls)
        gt_boxes:   list of (x1, y1, x2, y2, cls)
        iou_threshold: minimum IoU for a prediction to count as a match.

    Returns:
        dict with 'precision', 'recall', 'f1'.
    """
    # Edge cases first: a fully empty image counts as a perfect result.
    if len(pred_boxes) == 0 and len(gt_boxes) == 0:
        return {'precision': 1, 'recall': 1, 'f1': 1}
    if len(pred_boxes) == 0:
        return {'precision': 0, 'recall': 0, 'f1': 0}
    if len(gt_boxes) == 0:
        return {'precision': 0, 'recall': 1, 'f1': 0}
    # Highest-confidence predictions get first pick of ground truths.
    pred_boxes = sorted(pred_boxes, key=lambda x: x[4], reverse=True)
    matched_gt = set()
    tp, fp = 0, 0
    for pred in pred_boxes:
        best_iou, best_gt_idx = 0, -1
        for gt_idx, gt in enumerate(gt_boxes):
            # skip ground truths already matched, or of a different class
            if gt_idx in matched_gt or pred[5] != gt[4]:
                continue
            iou = compute_iou_single(pred[:4], gt[:4])
            if iou > best_iou:
                best_iou, best_gt_idx = iou, gt_idx
        if best_iou >= iou_threshold:
            tp += 1
            matched_gt.add(best_gt_idx)
        else:
            fp += 1
    fn = len(gt_boxes) - len(matched_gt)  # unmatched ground truths are misses
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {'precision': precision, 'recall': recall, 'f1': f1}
2.5 推理与可视化¶
import cv2
import numpy as np
# ===================== Batch inference =====================
model = YOLO('runs/detect/custom_yolov8m/weights/best.pt')  # best fine-tuned checkpoint
def detect_and_visualize(image_path, conf_threshold=0.5, save_path=None):
    """Run detection on one image, draw the boxes, optionally save the result.

    Returns the annotated image (BGR ndarray) and the raw Boxes object.
    """
    result = model.predict(
        source=image_path,
        conf=conf_threshold,
        iou=0.45,
        imgsz=640,
        device=0,
        verbose=False,
    )[0]

    canvas = result.orig_img.copy()
    palette = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    for det in result.boxes:
        x1, y1, x2, y2 = (int(v) for v in det.xyxy[0].tolist())
        score = det.conf[0].item()
        class_idx = int(det.cls[0].item())
        caption = f"{result.names[class_idx]} {score:.2f}"
        color = palette[class_idx % len(palette)]

        # box outline, filled label background, then white caption text
        cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)
        (text_w, text_h), _ = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
        cv2.rectangle(canvas, (x1, y1 - text_h - 8), (x1 + text_w, y1), color, -1)
        cv2.putText(canvas, caption, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

    if save_path:
        cv2.imwrite(save_path, canvas)
    return canvas, result.boxes
# Batch inference over a test folder
from pathlib import Path
test_images = list(Path("test_images").glob("*.jpg"))
for img_path in test_images:
    img, boxes = detect_and_visualize(str(img_path), save_path=f"results/{img_path.name}")
    print(f"{img_path.name}: 检测到 {len(boxes)} 个目标")

# ===================== ONNX export =====================
model.export(format='onnx', imgsz=640, simplify=True, opset=17, dynamic=True)
print("✅ ONNX模型已导出: runs/detect/custom_yolov8m/weights/best.onnx")
🌐 P3: 多模态检索系统¶
3.1 项目简介¶
基于CLIP构建图文跨模态检索系统,支持以文搜图、以图搜图,结合向量数据库实现高效检索。
技术栈:OpenAI CLIP、FAISS、Gradio、PIL
3.2 CLIP特征提取¶
import torch
import clip
from PIL import Image
import numpy as np
from pathlib import Path
from tqdm import tqdm
# ===================== Initialize CLIP =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)  # joint image/text embedding model
print(f"CLIP模型已加载 ({device})")
# ===================== Image feature extraction =====================
class CLIPFeatureExtractor:
    """Batched CLIP image/text encoder producing L2-normalized features."""

    def __init__(self, model, preprocess, device):
        self.model = model
        self.preprocess = preprocess
        self.device = device

    @torch.no_grad()
    def encode_images(self, image_paths, batch_size=32):
        """Encode images in batches; unreadable files are skipped with a warning."""
        chunks = []
        for start in range(0, len(image_paths), batch_size):
            tensors = []
            for path in image_paths[start:start + batch_size]:
                try:
                    tensors.append(self.preprocess(Image.open(path).convert("RGB")))
                except Exception as e:
                    print(f" 跳过 {path}: {e}")
            if not tensors:
                continue
            batch = torch.stack(tensors).to(self.device)
            feats = self.model.encode_image(batch)
            # unit length -> inner product equals cosine similarity
            feats = feats / feats.norm(dim=-1, keepdim=True)
            chunks.append(feats.cpu().numpy())
        return np.vstack(chunks) if chunks else np.array([])

    @torch.no_grad()
    def encode_text(self, texts):
        """Encode a list of strings into unit-norm text features."""
        tokens = clip.tokenize(texts, truncate=True).to(self.device)
        feats = self.model.encode_text(tokens)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        return feats.cpu().numpy()
extractor = CLIPFeatureExtractor(model, preprocess, device)

# Extract features for the whole image gallery
image_dir = Path("image_gallery")
image_paths = sorted(list(image_dir.glob("**/*.jpg")) + list(image_dir.glob("**/*.png")))
print(f"共 {len(image_paths)} 张图像")
image_features = extractor.encode_images(image_paths, batch_size=64)
print(f"特征矩阵: {image_features.shape}")  # (N, 512)
3.3 FAISS向量数据库¶
import faiss
import pickle
# ===================== Build the FAISS index =====================
class ImageSearchEngine:
    """FAISS-backed nearest-neighbour index over CLIP features.

    Features are assumed L2-normalized, so inner product == cosine similarity.
    """

    def __init__(self, feature_dim=512):
        self.feature_dim = feature_dim
        self.index = None        # FAISS index, created by build_index() or load()
        self.image_paths = []    # gallery paths aligned with index rows

    def build_index(self, features, image_paths, use_gpu=True):
        """Build an exact (small N) or IVF approximate (large N) index."""
        self.image_paths = [str(p) for p in image_paths]
        n, d = features.shape
        if n < 10000:
            # small gallery: exact inner-product search
            self.index = faiss.IndexFlatIP(d)  # normalized features -> cosine similarity
        else:
            # large gallery: IVF approximate search over ~sqrt(N) clusters
            nlist = min(int(np.sqrt(n)), 256)
            quantizer = faiss.IndexFlatIP(d)
            self.index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
            self.index.train(features.astype('float32'))  # IVF must be trained before add
            self.index.nprobe = 16  # clusters probed per query (recall/speed knob)
        if use_gpu and torch.cuda.is_available():
            res = faiss.StandardGpuResources()
            self.index = faiss.index_cpu_to_gpu(res, 0, self.index)
        self.index.add(features.astype('float32'))
        print(f"✅ FAISS索引已构建: {self.index.ntotal} 向量")

    def search(self, query_features, top_k=10):
        """Return top_k matches per query row as lists of {'path', 'score'} dicts."""
        scores, indices = self.index.search(query_features.astype('float32'), top_k)
        results = []
        for i in range(len(query_features)):
            result = []
            for score, idx in zip(scores[i], indices[i]):
                if idx >= 0:  # FAISS pads missing neighbours with -1
                    result.append({
                        'path': self.image_paths[idx],
                        'score': float(score),
                    })
            results.append(result)
        return results

    def save(self, path):
        """Persist the index (moved back to CPU if on GPU) plus path metadata."""
        # getDevice only exists on GPU indexes -- heuristic GPU detection
        cpu_index = faiss.index_gpu_to_cpu(self.index) if hasattr(self.index, 'getDevice') else self.index
        faiss.write_index(cpu_index, f"{path}.index")
        with open(f"{path}.meta", 'wb') as f:
            pickle.dump(self.image_paths, f)
        print(f"✅ 索引已保存: {path}")

    def load(self, path):
        """Load an index previously written by save()."""
        self.index = faiss.read_index(f"{path}.index")
        with open(f"{path}.meta", 'rb') as f:
            self.image_paths = pickle.load(f)
        print(f"✅ 索引已加载: {self.index.ntotal} 向量")
# Build the search engine over the gallery features
engine = ImageSearchEngine(feature_dim=512)
engine.build_index(image_features, image_paths)
engine.save("image_search_engine")
3.4 检索功能实现¶
# ===================== Text -> Image search =====================
def text_to_image_search(query_text, top_k=5):
    """Search the gallery with a natural-language query; print and return the hits."""
    hits = engine.search(extractor.encode_text([query_text]), top_k=top_k)[0]
    print(f"\n🔍 查询: '{query_text}'")
    for rank, hit in enumerate(hits):
        print(f" [{rank+1}] {hit['path']} (相似度: {hit['score']:.4f})")
    return hits
# Examples
text_to_image_search("a dog playing in the park")
text_to_image_search("sunset over the ocean")
# ===================== Image -> Image search =====================
def image_to_image_search(query_image_path, top_k=5):
    """Find the gallery images most similar to a query image."""
    feat = extractor.encode_images([query_image_path])
    # fetch one extra hit so the query image itself can be dropped
    raw = engine.search(feat, top_k=top_k + 1)[0]
    hits = [h for h in raw if h['path'] != str(query_image_path)][:top_k]
    print(f"\n🔍 查询图像: {query_image_path}")
    for rank, hit in enumerate(hits):
        print(f" [{rank+1}] {hit['path']} (相似度: {hit['score']:.4f})")
    return hits
# ===================== Hybrid search =====================
def hybrid_search(text_query=None, image_path=None, top_k=5, text_weight=0.5):
    """Search with a text query, an image, or a weighted blend of both.

    Args:
        text_query: Optional text prompt.
        image_path: Optional path to a query image.
        top_k: Number of results per query.
        text_weight: Weight of the text feature; the image gets 1 - text_weight.

    Returns:
        engine.search() results for the blended query vector.

    Raises:
        ValueError: If neither text_query nor image_path is provided
            (the original code crashed on sum([]) == 0 in that case).
    """
    if not text_query and not image_path:
        raise ValueError("hybrid_search requires at least one of text_query or image_path")
    features = []
    if text_query:
        features.append(extractor.encode_text([text_query]) * text_weight)
    if image_path:
        features.append(extractor.encode_images([image_path]) * (1 - text_weight))
    combined = sum(features)
    # re-normalize so the blended vector is unit length again
    combined = combined / np.linalg.norm(combined, axis=-1, keepdims=True)
    return engine.search(combined, top_k=top_k)
3.5 Gradio Web界面¶
import gradio as gr
def search_by_text(query, top_k):
    """Gradio callback: text query -> list of (PIL image, caption) pairs.

    Images that cannot be opened (deleted/moved files) are skipped silently.
    """
    text_feat = extractor.encode_text([query])
    results = engine.search(text_feat, top_k=int(top_k))
    images = []
    for res in results[0]:
        try:
            img = Image.open(res['path']).convert("RGB")
            images.append((img, f"Score: {res['score']:.3f}"))
        except OSError:
            # narrow exception: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs
            continue
    return images
def search_by_image(query_image, top_k):
    """Gradio callback: uploaded image -> list of (PIL image, caption) pairs.

    Returns an empty list when no image was uploaded.
    """
    if query_image is None:
        return []
    # Convert before saving: writing an RGBA/P-mode upload straight to JPEG
    # raises "cannot write mode RGBA as JPEG".
    query_image.convert("RGB").save("_temp_query.jpg")
    query_feat = extractor.encode_images(["_temp_query.jpg"])
    results = engine.search(query_feat, top_k=int(top_k))
    images = []
    for res in results[0]:
        try:
            img = Image.open(res['path']).convert("RGB")
            images.append((img, f"Score: {res['score']:.3f}"))
        except OSError:
            # narrow exception instead of the original bare `except:`
            continue
    return images
# Gradio UI: two tabs wiring the search callbacks to gallery outputs
with gr.Blocks(title="🌐 多模态图像检索系统") as demo:
    gr.Markdown("# 🌐 多模态图像检索系统\n基于CLIP + FAISS的图文跨模态检索")
    with gr.Tab("📝 以文搜图"):
        with gr.Row():
            text_input = gr.Textbox(label="输入描述", placeholder="a cat sitting on a sofa")
            text_k = gr.Slider(1, 20, value=8, step=1, label="返回数量")
        text_btn = gr.Button("🔍 搜索", variant="primary")
        text_gallery = gr.Gallery(label="检索结果", columns=4, height=400)
        text_btn.click(search_by_text, [text_input, text_k], text_gallery)
    with gr.Tab("🖼️ 以图搜图"):
        with gr.Row():
            img_input = gr.Image(label="上传查询图像", type="pil")
            img_k = gr.Slider(1, 20, value=8, step=1, label="返回数量")
        img_btn = gr.Button("🔍 搜索", variant="primary")
        img_gallery = gr.Gallery(label="检索结果", columns=4, height=400)
        img_btn.click(search_by_image, [img_input, img_k], img_gallery)

# 0.0.0.0 binds all interfaces so the demo is reachable from other machines
demo.launch(server_name="0.0.0.0", server_port=7860)
📊 项目总结与简历包装¶
项目成果参考¶
| 项目 | 核心指标 | 参考值 |
|---|---|---|
| P1 图像分类 | Top-1 Accuracy | 95%+ (迁移学习) |
| P2 目标检测 | mAP@0.5 | 85%+ (微调) |
| P3 多模态检索 | Recall@10 | 90%+ |
简历描述模板¶
图像分类系统:基于ResNet50迁移学习构建图像分类服务,采用Albumentations数据增强(ColorJitter、CoarseDropout/Cutout等策略),混合精度训练将显存占用降低50%;导出ONNX模型通过TensorRT FP16优化,推理延迟从8ms降至1.5ms,准确率95.2%。
目标检测系统:基于YOLOv8在自定义数据集上微调,实现COCO→YOLO格式转换Pipeline,采用Mosaic+CopyPaste增强策略,mAP@0.5达87.3%,支持实时视频流检测(45FPS@V100)。
多模态检索系统:基于CLIP构建图文跨模态检索引擎,FAISS IVF索引支持百万级图库毫秒级检索,Recall@10达92.1%,部署Gradio交互界面支持文搜图/图搜图/混合检索。
💡 学习建议:P1打基础 → P2学检测 → P3进多模态,每个项目先跑通再优化,最终整理到GitHub作品集。