🚀 计算机视觉实战项目¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
3个完整项目:图像分类 → 目标检测 → 多模态检索,覆盖训练、评估、部署全流程,附完整可运行代码。
📋 项目总览¶
| 项目 | 核心技术 | 难度 | 预计耗时 |
|---|---|---|---|
| P1: 图像分类系统 | ResNet/数据增强/AMP/ONNX | ⭐⭐⭐ | 2天 |
| P2: 目标检测系统 | YOLOv8微调/自定义数据 | ⭐⭐⭐⭐ | 3天 |
| P3: 多模态检索系统 | CLIP/向量数据库/图文检索 | ⭐⭐⭐⭐⭐ | 3天 |
🖼️ P1: 图像分类系统¶
1.1 项目简介¶
构建一个生产级图像分类系统,使用ResNet在自定义数据集上微调,包含完整的数据增强、混合精度训练、模型导出和推理部署流程。
技术栈:PyTorch、torchvision、ONNX Runtime、Albumentations
1.2 数据准备与增强¶
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import os
# ===================== Data augmentation =====================
# Training pipeline: geometric + photometric augmentation, then ImageNet
# normalization. NOTE(review): in Albumentations >= 1.4 RandomResizedCrop
# takes size=(h, w); the positional (224, 224) form is the legacy
# (height, width) signature -- confirm against the installed version.
train_transform = A.Compose([
    A.RandomResizedCrop(224, 224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.2),
    A.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1, p=0.8),
    A.OneOf([  # apply at most one of the blur/noise ops (p=0.3 overall)
        A.GaussianBlur(blur_limit=7),
        A.GaussNoise(var_limit=(10, 50)),
        A.MotionBlur(blur_limit=7),
    ], p=0.3),
    A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3),  # Cutout-style occlusion
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet statistics
    ToTensorV2(),  # HWC numpy array -> CHW torch tensor
])

# Validation pipeline: deterministic resize + center crop, no augmentation.
val_transform = A.Compose([
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])
# ===================== 数据集 =====================
class ImageClassificationDataset(Dataset):
    """Image-folder classification dataset.

    Expects the standard "one sub-directory per class" layout::

        data/
            train/
                class_0/ img1.jpg, img2.jpg, ...
                class_1/ img1.jpg, img2.jpg, ...
            val/
                class_0/ ...
                class_1/ ...

    Args:
        root_dir: Directory containing one sub-directory per class.
        transform: Optional Albumentations transform, applied as
            ``transform(image=ndarray)["image"]``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Only sub-directories are classes; stray files in root_dir
        # (e.g. .DS_Store) would otherwise corrupt the label mapping
        # and crash the os.listdir() below.
        self.classes = sorted(
            d for d in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, d))
        )
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.samples = []
        for cls_name in self.classes:
            cls_dir = os.path.join(root_dir, cls_name)
            # sorted() makes sample order deterministic across filesystems
            for fname in sorted(os.listdir(cls_dir)):
                if fname.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                    self.samples.append(
                        (os.path.join(cls_dir, fname), self.class_to_idx[cls_name])
                    )

    def __len__(self):
        """Number of (image, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample: decoded RGB image (optionally transformed) and int label."""
        img_path, label = self.samples[idx]
        image = np.array(Image.open(img_path).convert("RGB"))
        if self.transform:
            image = self.transform(image=image)["image"]
        return image, label
# Build the datasets and data loaders
train_dataset = ImageClassificationDataset("data/train", transform=train_transform)
val_dataset = ImageClassificationDataset("data/val", transform=val_transform)
# pin_memory speeds up host->GPU copies; workers decode/augment in parallel
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
print(f"训练集: {len(train_dataset)} 张, {len(train_dataset.classes)} 类")
print(f"验证集: {len(val_dataset)} 张")
1.3 模型构建与训练¶
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from sklearn.metrics import classification_report, confusion_matrix
import time
# ===================== Model definition =====================
class ImageClassifier(nn.Module):
    """ResNet-50 backbone with a dropout-regularized two-layer MLP head."""

    def __init__(self, num_classes, pretrained=True):
        super().__init__()
        weights = models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        self.backbone = models.resnet50(weights=weights)
        hidden = 512
        # Replace the stock 1000-class fc with a small classification head.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.backbone.fc.in_features, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(hidden, num_classes),
        )

    def forward(self, x):
        """Return raw class logits for a batch of images."""
        return self.backbone(x)
# ===================== 训练循环 =====================
def train_one_epoch(model, loader, criterion, optimizer, scaler, device):
model.train() # train()训练模式
total_loss, correct, total = 0, 0, 0
for images, labels in loader:
images, labels = images.to(device), labels.to(device) # 移至GPU/CPU
optimizer.zero_grad() # 清零梯度
with autocast('cuda'): # 混合精度
outputs = model(images)
loss = criterion(outputs, labels)
scaler.scale(loss).backward() # 反向传播计算梯度
scaler.unscale_(optimizer)
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()
total_loss += loss.item() * images.size(0) # 将单元素张量转为Python数值
_, preds = outputs.max(1)
correct += preds.eq(labels).sum().item()
total += labels.size(0)
return total_loss / total, correct / total
@torch.no_grad()  # inference only: no gradients needed
def evaluate(model, loader, criterion, device):
    """Evaluate *model* on *loader*.

    Returns:
        (mean_loss, accuracy, all_preds, all_labels) where the last two are
        flat Python lists of per-sample predictions and ground-truth labels.
    """
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds, all_labels = [], []
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        # Match the device actually in use instead of hard-coding 'cuda'.
        with autocast(torch.device(device).type):
            outputs = model(images)
            loss = criterion(outputs, labels)
        total_loss += loss.item() * images.size(0)
        _, preds = outputs.max(1)
        correct += preds.eq(labels).sum().item()
        total += labels.size(0)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    return total_loss / total, correct / total, all_preds, all_labels
# ===================== Main training flow =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(train_dataset.classes)
model = ImageClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # smoothing regularizes overconfident logits
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)  # cosine LR with warm restarts
scaler = GradScaler()  # AMP loss scaler
best_acc = 0
num_epochs = 30
for epoch in range(num_epochs):
    start = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, scaler, device)
    val_loss, val_acc, preds, labels = evaluate(model, val_loader, criterion, device)
    scheduler.step()  # advance the learning-rate schedule (not the weights)
    elapsed = time.time() - start
    print(f"Epoch {epoch+1}/{num_epochs} ({elapsed:.1f}s) | "
          f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")
    # Keep only the checkpoint with the best validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f" ✅ 保存最佳模型, Acc={best_acc:.4f}")

# Final evaluation with the best checkpoint reloaded
model.load_state_dict(torch.load("best_model.pth", weights_only=True))
_, _, preds, labels = evaluate(model, val_loader, criterion, device)
print("\n" + classification_report(labels, preds, target_names=train_dataset.classes))
1.4 ONNX导出与推理部署¶
import onnxruntime as ort
# ===================== ONNX export =====================
model.eval()  # disable dropout before tracing
dummy_input = torch.randn(1, 3, 224, 224).to(device)  # example input used for tracing
torch.onnx.export(
    model, dummy_input, "classifier.onnx",
    input_names=["image"],
    output_names=["logits"],
    dynamic_axes={"image": {0: "batch"}, "logits": {0: "batch"}},  # allow variable batch size
    opset_version=17
)
print("✅ ONNX模型已导出")
# ===================== ONNX inference =====================
class ONNXClassifier:
    """Thin wrapper around an ONNX Runtime session for image classification."""

    def __init__(self, model_path, class_names):
        # Prefer CUDA when available; ONNX Runtime falls back to CPU otherwise.
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        self.session = ort.InferenceSession(model_path, providers=providers)
        self.class_names = class_names
        self.transform = val_transform  # same deterministic preprocessing as validation

    def predict(self, image_path):
        """Classify one image; return top-1 class, its confidence, and a top-5 list."""
        rgb = np.array(Image.open(image_path).convert("RGB"))
        batch = self.transform(image=rgb)["image"].unsqueeze(0).numpy()  # add batch dim
        logits = self.session.run(None, {"image": batch})[0][0]
        probs = self._softmax(logits)
        best = int(probs.argmax())
        return {
            "class": self.class_names[best],
            "confidence": float(probs[best]),
            "top5": [(self.class_names[i], float(probs[i]))
                     for i in probs.argsort()[::-1][:5]],  # descending probability
        }

    @staticmethod
    def _softmax(x):
        """Numerically stable softmax over a 1-D logit vector."""
        shifted = np.exp(x - np.max(x))
        return shifted / shifted.sum()
# Usage example
classifier = ONNXClassifier("classifier.onnx", train_dataset.classes)
result = classifier.predict("test_image.jpg")
print(f"预测: {result['class']} ({result['confidence']:.2%})")
for cls, prob in result['top5']:
    print(f" {cls}: {prob:.2%}")
🔍 P2: 目标检测系统¶
2.1 项目简介¶
使用YOLOv8在自定义数据集上微调目标检测模型,覆盖数据标注格式、训练配置、模型评估和推理部署全流程。
技术栈:Ultralytics YOLOv8、OpenCV、COCO格式
2.2 数据集准备¶
import yaml
import shutil
from pathlib import Path
# ===================== YOLO数据集目录结构 =====================
"""
dataset/
├── images/
│ ├── train/ # 训练图像
│ └── val/ # 验证图像
├── labels/
│ ├── train/ # 训练标签 (YOLO格式txt)
│ └── val/ # 验证标签
└── data.yaml # 数据集配置
"""
# ===================== Dataset config file =====================
data_config = {
    'path': './dataset',
    'train': 'images/train',
    'val': 'images/val',
    'names': {  # class id -> class name
        0: 'person',
        1: 'car',
        2: 'bicycle',
        3: 'dog',
        4: 'cat',
    }
}
# Write YOLO's data.yaml; the context manager closes the file automatically
with open('dataset/data.yaml', 'w') as f:
    yaml.dump(data_config, f, default_flow_style=False)
# ===================== COCO → YOLO 格式转换 =====================
import json
def coco_to_yolo(coco_json_path, output_dir, image_dir):
    """Convert COCO-format annotations to YOLO txt labels.

    One ``<image_stem>.txt`` per image is written to *output_dir*; each line
    is ``cls cx cy w h`` with coordinates normalized to [0, 1].
    Note: *image_dir* is accepted for interface compatibility but unused here.
    """
    from collections import defaultdict

    with open(coco_json_path) as f:
        coco = json.load(f)

    # image id -> image record
    images_by_id = {rec['id']: rec for rec in coco['images']}
    # raw COCO category id -> contiguous 0-based class id
    class_ids = {cat['id']: idx for idx, cat in enumerate(coco['categories'])}

    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # group annotations by the image they belong to
    grouped = defaultdict(list)
    for ann in coco['annotations']:
        grouped[ann['image_id']].append(ann)

    for image_id, rec in images_by_id.items():
        img_w, img_h = rec['width'], rec['height']
        rows = []
        for ann in grouped.get(image_id, []):
            x_min, y_min, box_w, box_h = ann['bbox']  # COCO: top-left corner + size
            # YOLO wants a normalized box center + size
            center_x = (x_min + box_w / 2) / img_w
            center_y = (y_min + box_h / 2) / img_h
            rows.append(
                f"{class_ids[ann['category_id']]} "
                f"{center_x:.6f} {center_y:.6f} "
                f"{box_w / img_w:.6f} {box_h / img_h:.6f}"
            )
        (out_root / (Path(rec['file_name']).stem + '.txt')).write_text('\n'.join(rows))

    print(f"✅ 转换完成: {len(images_by_id)} 张图像的标签已保存到 {output_dir}")
# Usage example
coco_to_yolo("annotations/instances_train.json", "dataset/labels/train", "dataset/images/train")
2.3 YOLOv8 训练¶
from ultralytics import YOLO
# ===================== Model training =====================
# Load a pretrained checkpoint
model = YOLO('yolov8m.pt')  # medium size: accuracy/speed trade-off

# Training configuration
results = model.train(
    data='dataset/data.yaml',
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,
    # optimizer settings
    optimizer='AdamW',
    lr0=0.001,
    lrf=0.01,  # final LR = lr0 * lrf
    weight_decay=0.0005,
    warmup_epochs=3,
    # data augmentation
    mosaic=1.0,  # Mosaic augmentation
    mixup=0.1,  # MixUp augmentation
    copy_paste=0.1,  # Copy-Paste augmentation
    hsv_h=0.015,  # hue jitter
    hsv_s=0.7,  # saturation jitter
    hsv_v=0.4,  # value/brightness jitter
    degrees=10.0,  # rotation
    translate=0.1,  # translation
    scale=0.5,  # scaling
    fliplr=0.5,  # horizontal flip probability
    # checkpointing
    project='runs/detect',
    name='custom_yolov8m',
    save=True,
    save_period=10,  # checkpoint every 10 epochs
    patience=20,  # early-stopping patience (epochs without improvement)
)
print(f"✅ 训练完成! 最佳模型: runs/detect/custom_yolov8m/weights/best.pt")
2.4 模型评估¶
# ===================== Evaluation metrics =====================
model = YOLO('runs/detect/custom_yolov8m/weights/best.pt')

# Validation-set evaluation
metrics = model.val(data='dataset/data.yaml', imgsz=640, batch=32)
print("=" * 50)
print(f"mAP@0.5: {metrics.box.map50:.4f}")
print(f"mAP@0.5:0.95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")
print("=" * 50)

# Per-class AP
for i, name in enumerate(data_config['names'].values()):
    print(f" {name}: mAP50={metrics.box.ap50[i]:.4f}, mAP50-95={metrics.box.ap[i]:.4f}")
# ===================== Custom evaluation function =====================
def compute_iou_single(box_a, box_b):
    """IoU of two axis-aligned boxes given as (x1, y1, x2, y2).

    This helper was referenced but never defined in the original code,
    which made compute_detection_metrics raise NameError at runtime.
    """
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

def compute_detection_metrics(pred_boxes, gt_boxes, iou_threshold=0.5):
    """Greedy single-image detection precision/recall/F1.

    Args:
        pred_boxes: list of (x1, y1, x2, y2, conf, cls)
        gt_boxes:   list of (x1, y1, x2, y2, cls)
        iou_threshold: minimum IoU for a prediction to count as a match.

    Returns:
        dict with 'precision', 'recall', 'f1'.
    """
    # Edge cases first: a fully empty image counts as a perfect result.
    if len(pred_boxes) == 0 and len(gt_boxes) == 0:
        return {'precision': 1, 'recall': 1, 'f1': 1}
    if len(pred_boxes) == 0:
        return {'precision': 0, 'recall': 0, 'f1': 0}
    if len(gt_boxes) == 0:
        return {'precision': 0, 'recall': 1, 'f1': 0}
    # Highest-confidence predictions get first pick of ground truths.
    pred_boxes = sorted(pred_boxes, key=lambda x: x[4], reverse=True)
    matched_gt = set()
    tp, fp = 0, 0
    for pred in pred_boxes:
        best_iou, best_gt_idx = 0, -1
        for gt_idx, gt in enumerate(gt_boxes):
            # skip ground truths already matched, or of a different class
            if gt_idx in matched_gt or pred[5] != gt[4]:
                continue
            iou = compute_iou_single(pred[:4], gt[:4])
            if iou > best_iou:
                best_iou, best_gt_idx = iou, gt_idx
        if best_iou >= iou_threshold:
            tp += 1
            matched_gt.add(best_gt_idx)
        else:
            fp += 1
    fn = len(gt_boxes) - len(matched_gt)  # unmatched ground truths are misses
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {'precision': precision, 'recall': recall, 'f1': f1}
2.5 推理与可视化¶
import cv2
import numpy as np
# ===================== Batch inference =====================
model = YOLO('runs/detect/custom_yolov8m/weights/best.pt')  # best fine-tuned checkpoint
def detect_and_visualize(image_path, conf_threshold=0.5, save_path=None):
    """Run detection on one image, draw the boxes, optionally save the result.

    Returns the annotated image (BGR ndarray) and the raw Boxes object.
    """
    result = model.predict(
        source=image_path,
        conf=conf_threshold,
        iou=0.45,
        imgsz=640,
        device=0,
        verbose=False,
    )[0]

    canvas = result.orig_img.copy()
    palette = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    for det in result.boxes:
        x1, y1, x2, y2 = (int(v) for v in det.xyxy[0].tolist())
        score = det.conf[0].item()
        class_idx = int(det.cls[0].item())
        caption = f"{result.names[class_idx]} {score:.2f}"
        color = palette[class_idx % len(palette)]

        # box outline, filled label background, then white caption text
        cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)
        (text_w, text_h), _ = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
        cv2.rectangle(canvas, (x1, y1 - text_h - 8), (x1 + text_w, y1), color, -1)
        cv2.putText(canvas, caption, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

    if save_path:
        cv2.imwrite(save_path, canvas)
    return canvas, result.boxes
# Batch inference over a test folder
from pathlib import Path
test_images = list(Path("test_images").glob("*.jpg"))
for img_path in test_images:
    img, boxes = detect_and_visualize(str(img_path), save_path=f"results/{img_path.name}")
    print(f"{img_path.name}: 检测到 {len(boxes)} 个目标")

# ===================== ONNX export =====================
model.export(format='onnx', imgsz=640, simplify=True, opset=17, dynamic=True)
print("✅ ONNX模型已导出: runs/detect/custom_yolov8m/weights/best.onnx")
🌐 P3: 多模态检索系统¶
3.1 项目简介¶
基于CLIP构建图文跨模态检索系统,支持以文搜图、以图搜图,结合向量数据库实现高效检索。
技术栈:OpenAI CLIP、FAISS、Gradio、PIL
3.2 CLIP特征提取¶
import torch
import clip
from PIL import Image
import numpy as np
from pathlib import Path
from tqdm import tqdm
# ===================== Initialize CLIP =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)  # joint image/text embedding model
print(f"CLIP模型已加载 ({device})")
# ===================== Image feature extraction =====================
class CLIPFeatureExtractor:
    """Batched CLIP image/text encoder producing L2-normalized features."""

    def __init__(self, model, preprocess, device):
        self.model = model
        self.preprocess = preprocess
        self.device = device

    @torch.no_grad()
    def encode_images(self, image_paths, batch_size=32):
        """Encode images in batches; unreadable files are skipped with a warning."""
        chunks = []
        for start in range(0, len(image_paths), batch_size):
            tensors = []
            for path in image_paths[start:start + batch_size]:
                try:
                    tensors.append(self.preprocess(Image.open(path).convert("RGB")))
                except Exception as e:
                    print(f" 跳过 {path}: {e}")
            if not tensors:
                continue
            batch = torch.stack(tensors).to(self.device)
            feats = self.model.encode_image(batch)
            # unit length -> inner product equals cosine similarity
            feats = feats / feats.norm(dim=-1, keepdim=True)
            chunks.append(feats.cpu().numpy())
        return np.vstack(chunks) if chunks else np.array([])

    @torch.no_grad()
    def encode_text(self, texts):
        """Encode a list of strings into unit-norm text features."""
        tokens = clip.tokenize(texts, truncate=True).to(self.device)
        feats = self.model.encode_text(tokens)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        return feats.cpu().numpy()
extractor = CLIPFeatureExtractor(model, preprocess, device)

# Extract features for the whole image gallery
image_dir = Path("image_gallery")
image_paths = sorted(list(image_dir.glob("**/*.jpg")) + list(image_dir.glob("**/*.png")))
print(f"共 {len(image_paths)} 张图像")
image_features = extractor.encode_images(image_paths, batch_size=64)
print(f"特征矩阵: {image_features.shape}")  # (N, 512)
3.3 FAISS向量数据库¶
import faiss
import pickle
# ===================== Build the FAISS index =====================
class ImageSearchEngine:
    """FAISS-backed nearest-neighbour index over CLIP features.

    Features are assumed L2-normalized, so inner product == cosine similarity.
    """

    def __init__(self, feature_dim=512):
        self.feature_dim = feature_dim
        self.index = None        # FAISS index, created by build_index() or load()
        self.image_paths = []    # gallery paths aligned with index rows

    def build_index(self, features, image_paths, use_gpu=True):
        """Build an exact (small N) or IVF approximate (large N) index."""
        self.image_paths = [str(p) for p in image_paths]
        n, d = features.shape
        if n < 10000:
            # small gallery: exact inner-product search
            self.index = faiss.IndexFlatIP(d)  # normalized features -> cosine similarity
        else:
            # large gallery: IVF approximate search over ~sqrt(N) clusters
            nlist = min(int(np.sqrt(n)), 256)
            quantizer = faiss.IndexFlatIP(d)
            self.index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
            self.index.train(features.astype('float32'))  # IVF must be trained before add
            self.index.nprobe = 16  # clusters probed per query (recall/speed knob)
        if use_gpu and torch.cuda.is_available():
            res = faiss.StandardGpuResources()
            self.index = faiss.index_cpu_to_gpu(res, 0, self.index)
        self.index.add(features.astype('float32'))
        print(f"✅ FAISS索引已构建: {self.index.ntotal} 向量")

    def search(self, query_features, top_k=10):
        """Return top_k matches per query row as lists of {'path', 'score'} dicts."""
        scores, indices = self.index.search(query_features.astype('float32'), top_k)
        results = []
        for i in range(len(query_features)):
            result = []
            for score, idx in zip(scores[i], indices[i]):
                if idx >= 0:  # FAISS pads missing neighbours with -1
                    result.append({
                        'path': self.image_paths[idx],
                        'score': float(score),
                    })
            results.append(result)
        return results

    def save(self, path):
        """Persist the index (moved back to CPU if on GPU) plus path metadata."""
        # getDevice only exists on GPU indexes -- heuristic GPU detection
        cpu_index = faiss.index_gpu_to_cpu(self.index) if hasattr(self.index, 'getDevice') else self.index
        faiss.write_index(cpu_index, f"{path}.index")
        with open(f"{path}.meta", 'wb') as f:
            pickle.dump(self.image_paths, f)
        print(f"✅ 索引已保存: {path}")

    def load(self, path):
        """Load an index previously written by save()."""
        self.index = faiss.read_index(f"{path}.index")
        with open(f"{path}.meta", 'rb') as f:
            self.image_paths = pickle.load(f)
        print(f"✅ 索引已加载: {self.index.ntotal} 向量")
# Build the search engine over the gallery features
engine = ImageSearchEngine(feature_dim=512)
engine.build_index(image_features, image_paths)
engine.save("image_search_engine")
3.4 检索功能实现¶
# ===================== Text -> Image search =====================
def text_to_image_search(query_text, top_k=5):
    """Search the gallery with a natural-language query; print and return the hits."""
    hits = engine.search(extractor.encode_text([query_text]), top_k=top_k)[0]
    print(f"\n🔍 查询: '{query_text}'")
    for rank, hit in enumerate(hits):
        print(f" [{rank+1}] {hit['path']} (相似度: {hit['score']:.4f})")
    return hits
# Examples
text_to_image_search("a dog playing in the park")
text_to_image_search("sunset over the ocean")
# ===================== Image -> Image search =====================
def image_to_image_search(query_image_path, top_k=5):
    """Find the gallery images most similar to a query image."""
    feat = extractor.encode_images([query_image_path])
    # fetch one extra hit so the query image itself can be dropped
    raw = engine.search(feat, top_k=top_k + 1)[0]
    hits = [h for h in raw if h['path'] != str(query_image_path)][:top_k]
    print(f"\n🔍 查询图像: {query_image_path}")
    for rank, hit in enumerate(hits):
        print(f" [{rank+1}] {hit['path']} (相似度: {hit['score']:.4f})")
    return hits
# ===================== Hybrid search =====================
def hybrid_search(text_query=None, image_path=None, top_k=5, text_weight=0.5):
    """Search with a text query, an image, or a weighted blend of both.

    Args:
        text_query: Optional text prompt.
        image_path: Optional path to a query image.
        top_k: Number of results per query.
        text_weight: Weight of the text feature; the image gets 1 - text_weight.

    Returns:
        engine.search() results for the blended query vector.

    Raises:
        ValueError: If neither text_query nor image_path is provided
            (the original code crashed on sum([]) == 0 in that case).
    """
    if not text_query and not image_path:
        raise ValueError("hybrid_search requires at least one of text_query or image_path")
    features = []
    if text_query:
        features.append(extractor.encode_text([text_query]) * text_weight)
    if image_path:
        features.append(extractor.encode_images([image_path]) * (1 - text_weight))
    combined = sum(features)
    # re-normalize so the blended vector is unit length again
    combined = combined / np.linalg.norm(combined, axis=-1, keepdims=True)
    return engine.search(combined, top_k=top_k)
3.5 Gradio Web界面¶
import gradio as gr
def search_by_text(query, top_k):
    """Gradio callback: text query -> list of (PIL image, caption) pairs.

    Images that cannot be opened (deleted/moved files) are skipped silently.
    """
    text_feat = extractor.encode_text([query])
    results = engine.search(text_feat, top_k=int(top_k))
    images = []
    for res in results[0]:
        try:
            img = Image.open(res['path']).convert("RGB")
            images.append((img, f"Score: {res['score']:.3f}"))
        except OSError:
            # narrow exception: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs
            continue
    return images
def search_by_image(query_image, top_k):
    """Gradio callback: uploaded image -> list of (PIL image, caption) pairs.

    Returns an empty list when no image was uploaded.
    """
    if query_image is None:
        return []
    # Convert before saving: writing an RGBA/P-mode upload straight to JPEG
    # raises "cannot write mode RGBA as JPEG".
    query_image.convert("RGB").save("_temp_query.jpg")
    query_feat = extractor.encode_images(["_temp_query.jpg"])
    results = engine.search(query_feat, top_k=int(top_k))
    images = []
    for res in results[0]:
        try:
            img = Image.open(res['path']).convert("RGB")
            images.append((img, f"Score: {res['score']:.3f}"))
        except OSError:
            # narrow exception instead of the original bare `except:`
            continue
    return images
# Gradio UI: two tabs wiring the search callbacks to gallery outputs
with gr.Blocks(title="🌐 多模态图像检索系统") as demo:
    gr.Markdown("# 🌐 多模态图像检索系统\n基于CLIP + FAISS的图文跨模态检索")
    with gr.Tab("📝 以文搜图"):
        with gr.Row():
            text_input = gr.Textbox(label="输入描述", placeholder="a cat sitting on a sofa")
            text_k = gr.Slider(1, 20, value=8, step=1, label="返回数量")
        text_btn = gr.Button("🔍 搜索", variant="primary")
        text_gallery = gr.Gallery(label="检索结果", columns=4, height=400)
        text_btn.click(search_by_text, [text_input, text_k], text_gallery)
    with gr.Tab("🖼️ 以图搜图"):
        with gr.Row():
            img_input = gr.Image(label="上传查询图像", type="pil")
            img_k = gr.Slider(1, 20, value=8, step=1, label="返回数量")
        img_btn = gr.Button("🔍 搜索", variant="primary")
        img_gallery = gr.Gallery(label="检索结果", columns=4, height=400)
        img_btn.click(search_by_image, [img_input, img_k], img_gallery)

# 0.0.0.0 binds all interfaces so the demo is reachable from other machines
demo.launch(server_name="0.0.0.0", server_port=7860)
📊 项目总结与简历包装¶
项目成果参考¶
| 项目 | 核心指标 | 参考值 |
|---|---|---|
| P1 图像分类 | Top-1 Accuracy | 95%+ (迁移学习) |
| P2 目标检测 | mAP@0.5 | 85%+ (微调) |
| P3 多模态检索 | Recall@10 | 90%+ |
简历描述模板¶
图像分类系统:基于ResNet50迁移学习构建图像分类服务,采用Albumentations数据增强(ColorJitter、CoarseDropout/Cutout等策略),混合精度训练将显存占用降低50%;导出ONNX模型通过TensorRT FP16优化,推理延迟从8ms降至1.5ms,准确率95.2%。
目标检测系统:基于YOLOv8在自定义数据集上微调,实现COCO→YOLO格式转换Pipeline,采用Mosaic+CopyPaste增强策略,mAP@0.5达87.3%,支持实时视频流检测(45FPS@V100)。
多模态检索系统:基于CLIP构建图文跨模态检索引擎,FAISS IVF索引支持百万级图库毫秒级检索,Recall@10达92.1%,部署Gradio交互界面支持文搜图/图搜图/混合检索。
💡 学习建议:P1打基础 → P2学检测 → P3进多模态,每个项目先跑通再优化,最终整理到GitHub作品集。