第9章 视频分析与理解¶
📚 章节概述¶
本章介绍视频分析与理解的核心技术,包括光流估计、目标跟踪、动作识别等。视频分析是计算机视觉的重要方向,广泛应用于安防监控、体育分析、人机交互等领域。
学习时间:5-7天 难度等级:⭐⭐⭐⭐ 前置知识:第1-8章
🎯 学习目标¶
完成本章后,你将能够: - 理解视频分析的特殊性 - 掌握光流估计和目标跟踪 - 了解动作识别技术 - 能够实现视频分析应用 - 完成视频分析项目
9.1 光流估计¶
9.1.1 Lucas-Kanade光流¶
Python
import cv2
import numpy as np
def lucas_kanade_optical_flow(prev_frame, curr_frame):
    """Sparse Lucas-Kanade optical flow between two consecutive BGR frames.

    Detects Shi-Tomasi corners in the previous frame and tracks them into
    the current frame with pyramidal Lucas-Kanade.

    Args:
        prev_frame: previous BGR frame (H x W x 3 uint8).
        curr_frame: current BGR frame, same size as prev_frame.

    Returns:
        (good_prev, good_next): arrays of successfully tracked point
        coordinates in the previous and current frame; both empty when no
        trackable feature is found.
    """
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
    # Shi-Tomasi corner detection; returns None when no corner passes the
    # quality threshold (e.g. a uniform frame), which would crash PyrLK below.
    prev_pts = cv2.goodFeaturesToTrack(prev_gray, maxCorners=100,
                                       qualityLevel=0.3, minDistance=7)
    if prev_pts is None:
        empty = np.empty((0, 2), dtype=np.float32)
        return empty, empty
    # Pyramidal Lucas-Kanade optical flow
    lk_params = dict(winSize=(15, 15), maxLevel=2,
                     criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
    next_pts, status, err = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, prev_pts, None, **lk_params)
    # Keep only the points whose tracking succeeded (status == 1)
    good_prev = prev_pts[status == 1]
    good_next = next_pts[status == 1]
    return good_prev, good_next
def visualize_optical_flow(image, prev_pts, next_pts):
    """Draw green arrows from each previous point to its tracked position.

    Args:
        image: BGR frame to annotate (not modified; a copy is returned).
        prev_pts: point coordinates in the previous frame.
        next_pts: matching point coordinates in the current frame.

    Returns:
        An annotated copy of `image`.
    """
    result = image.copy()
    # zip pairs points positionally; 'nxt' avoids shadowing the builtin next()
    for prev, nxt in zip(prev_pts, next_pts):
        p0 = tuple(prev.ravel().astype(int))
        p1 = tuple(nxt.ravel().astype(int))
        cv2.arrowedLine(result, p0, p1, (0, 255, 0), 2)
    return result
9.1.2 稠密光流(Farneback)¶
Python
def farneback_optical_flow(prev_frame, curr_frame):
    """Dense Farneback optical flow rendered as a BGR color image.

    Hue encodes flow direction, value (brightness) encodes flow magnitude.

    Args:
        prev_frame: previous BGR frame.
        curr_frame: current BGR frame, same size.

    Returns:
        A BGR uint8 image of the same spatial size visualizing the flow field.
    """
    gray_a = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    gray_b = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
    # pyr_scale=0.5, levels=3, winsize=15, iterations=3, poly_n=5, poly_sigma=1.2
    flow = cv2.calcOpticalFlowFarneback(gray_a, gray_b, None,
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    height, width = flow.shape[:2]
    magnitude, angle = cv2.cartToPolar(flow[..., 0], flow[..., 1])
    hsv = np.zeros((height, width, 3), dtype=np.uint8)
    # OpenCV hue lives in [0, 180): map the angle (radians) into that range
    hsv[..., 0] = angle * 180 / np.pi / 2
    hsv[..., 1] = 255
    # Stretch magnitudes to the full 8-bit brightness range
    hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)
    return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
9.2 目标跟踪¶
9.2.1 KCF跟踪器¶
Python
def kcf_tracking(video_path):
    """Interactive single-object KCF tracking.

    Lets the user draw an ROI on the first frame, then tracks that region
    until the video ends or 'q' is pressed.

    Args:
        video_path: path to the input video file.
    """
    cap = cv2.VideoCapture(video_path)
    # Read the first frame
    ret, frame = cap.read()
    if not ret:
        cap.release()  # release the capture even when the video is unreadable
        return
    # Let the user draw the initial bounding box
    bbox = cv2.selectROI('Select ROI', frame, fromCenter=False, showCrosshair=True)
    cv2.destroyAllWindows()
    if bbox == (0, 0, 0, 0):
        # selectROI returns an all-zero box when the selection is cancelled
        cap.release()
        return
    # Initialize the KCF tracker on the chosen region
    tracker = cv2.TrackerKCF_create()
    tracker.init(frame, bbox)
    # Track frame by frame
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        success, bbox = tracker.update(frame)
        if success:
            x, y, w, h = (int(v) for v in bbox)
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.imshow('Tracking', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
9.2.2 DeepSORT跟踪¶
Python
from deep_sort_realtime.deepsort_tracker import DeepSort
def deepsort_tracking(video_path, detector):
    """Multi-object tracking with DeepSORT on top of an external detector.

    Args:
        video_path: path to the input video file.
        detector: object exposing detect(frame) -> detections in the format
            expected by DeepSort.update_tracks (presumably
            ([x, y, w, h], confidence, class) tuples — confirm with the
            detector implementation).
    """
    cap = cv2.VideoCapture(video_path)
    tracker = DeepSort(max_age=30)  # drop tracks unseen for 30 frames
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Per-frame detection feeds the association step
        detections = detector.detect(frame)
        tracks = tracker.update_tracks(detections, frame=frame)
        for track in tracks:
            if not track.is_confirmed():
                continue
            # to_ltrb() returns floats; OpenCV drawing needs integer pixels
            x1, y1, x2, y2 = map(int, track.to_ltrb())
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {track.track_id}', (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        cv2.imshow('DeepSORT', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
9.3 动作识别¶
9.3.1 3D CNN¶
Python
import torch
import torch.nn as nn
class C3D(nn.Module):
    """Truncated C3D-style 3D CNN for clip-level action classification.

    Three convolutional stages with 3D max-pooling, followed by two hidden
    fully-connected layers and a classification head. The fc6 input size of
    8192 fixes the expected clip shape (e.g. 3 x 8 x 32 x 32 per sample —
    confirm against the data pipeline).
    """

    def __init__(self, num_classes=101):
        super(C3D, self).__init__()
        # Stage 1: pool only spatially to preserve early temporal resolution
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        # Stage 2: halve both temporal and spatial dimensions
        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        # Stage 3: two stacked convolutions before pooling
        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        # Classifier head
        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, num_classes)
        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch of clips (B, 3, D, H, W)."""
        for conv, pool in ((self.conv1, self.pool1), (self.conv2, self.pool2)):
            x = pool(self.relu(conv(x)))
        x = self.pool3(self.relu(self.conv3b(self.relu(self.conv3a(x)))))
        x = torch.flatten(x, 1)  # (B, 8192)
        for fc in (self.fc6, self.fc7):
            x = self.dropout(self.relu(fc(x)))
        return self.fc8(x)
9.3.2 Two-Stream网络¶
Python
from torchvision import models
class TwoStreamNetwork(nn.Module):
    """Two-stream action recognition: one ResNet-50 over RGB frames and one
    over stacked optical-flow maps, fused by summing class logits."""

    def __init__(self, num_classes=101):
        super(TwoStreamNetwork, self).__init__()
        # Spatial stream: pretrained ResNet-50 over single RGB frames
        self.spatial_stream = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.spatial_stream.fc = nn.Linear(2048, num_classes)
        # Temporal stream: same backbone with the first conv widened to 20
        # input channels (presumably 10 stacked flow fields x 2 components —
        # confirm against the data loader)
        self.temporal_stream = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.temporal_stream.conv1 = nn.Conv2d(20, 64, kernel_size=7, stride=2, padding=3)
        self.temporal_stream.fc = nn.Linear(2048, num_classes)

    def forward(self, rgb, flow):
        """Return the element-wise sum of both streams' class logits."""
        logits_rgb = self.spatial_stream(rgb)
        logits_flow = self.temporal_stream(flow)
        return logits_rgb + logits_flow
9.4 实战案例:视频目标检测与跟踪¶
Python
import cv2
import torch
class VideoTracker:
    """Detect-then-track pipeline: periodic YOLOv5 detection with CSRT
    tracking on the intermediate frames."""

    def __init__(self, detector_path='yolov5s.pt'):
        """Load the YOLOv5 detector.

        Args:
            detector_path: path to the YOLOv5 weight file.
        """
        # torch.hub downloads/caches the ultralytics repo on first use
        self.detector = torch.hub.load('ultralytics/yolov5', 'custom', path=detector_path)
        # Created lazily: OpenCV trackers cannot be reliably re-initialized,
        # so a fresh tracker instance is made for every new detection.
        self.tracker = None
        self.tracking = False

    def process_video(self, video_path, output_path):
        """Process a video: detect every 30 frames, track in between,
        write annotated frames to output_path.

        Args:
            video_path: input video file.
            output_path: output video file (mp4).
        """
        cap = cv2.VideoCapture(video_path)
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Video writer matching the input's geometry and frame rate
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        frame_count = 0
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_count += 1
                # Re-detect every 30 frames, or immediately after a loss
                if frame_count % 30 == 0 or not self.tracking:
                    results = self.detector(frame)
                    if len(results.xyxy[0]) > 0:
                        # First row = highest-confidence detection (YOLOv5
                        # returns detections sorted by confidence)
                        best = results.xyxy[0][0]
                        bbox = (int(best[0]), int(best[1]),
                                int(best[2] - best[0]), int(best[3] - best[1]))
                        # New tracker per detection: re-calling init() on a
                        # used OpenCV tracker is unreliable
                        self.tracker = cv2.TrackerCSRT_create()
                        self.tracker.init(frame, bbox)
                        self.tracking = True
                # Track the current target
                if self.tracking:
                    success, bbox = self.tracker.update(frame)
                    if success:
                        x, y, w, h = (int(v) for v in bbox)
                        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    else:
                        self.tracking = False  # force re-detection next frame
                out.write(frame)
                cv2.imshow('Video Tracking', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Release resources even if detection/tracking raises
            cap.release()
            out.release()
            cv2.destroyAllWindows()
# Example usage: only run the pipeline when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    tracker = VideoTracker()
    tracker.process_video('input.mp4', 'output.mp4')
9.5 练习题¶
基础题¶
- 简答题:
- 光流估计的原理是什么?
光流估计基于亮度恒定假设(同一像素在相邻帧中亮度不变)和小运动假设,通过计算相邻帧间像素的位移向量场来表示运动。经典方法包括Lucas-Kanade(稀疏光流)和Farneback(稠密光流)。
- 目标跟踪有哪些方法?
主要方法:①相关滤波类(KCF、DCF),速度快,利用循环矩阵加速;②深度学习类(SiamFC、SiamRPN),利用孪生网络模板匹配;③多目标跟踪(SORT、DeepSORT),结合检测+关联,使用卡尔曼滤波预测和外观特征匹配。
进阶题¶
- 编程题:
- 实现一个简单的光流可视化。
- 使用DeepSORT进行多目标跟踪。
9.6 面试准备¶
大厂面试题¶
Q1: 光流估计的原理是什么?
参考答案: - 基于亮度恒定假设 - 假设像素在短时间内移动很小 - Lucas-Kanade:稀疏光流 - Farneback:稠密光流
Q2: 目标跟踪和目标检测有什么区别?
参考答案: - 检测:每帧独立检测目标 - 跟踪:跨帧关联同一目标 - 跟踪更高效,但可能漂移 - 通常结合使用
9.7 本章小结¶
核心知识点¶
- 光流估计:Lucas-Kanade、Farneback
- 目标跟踪:KCF、DeepSORT
- 动作识别:3D CNN、Two-Stream
- 视频分析:时序建模
下一步¶
下一章:10-三维视觉.md - 学习3D视觉
恭喜完成第9章! 🎉