04 - 视频生成实战¶
学习时间: 4小时 重要性: ⭐⭐⭐⭐⭐ 掌握最前沿的AIGC工程实战能力
🎯 项目目标¶
完成本项目后,你将能够: - 完成CogVideoX和Open-Sora的环境配置与模型下载 - 实现文本到视频(Text-to-Video)完整推理流程 - 实现图像到视频(Image-to-Video)生成 - 掌握视频生成的常见问题排查与效果调优技巧
1. 项目概述¶
1.1 项目简介¶
本项目将动手实现视频生成的完整流程,涵盖两个主流开源模型: - CogVideoX:基于3D因果VAE + Expert DiT的高质量视频生成 - Open-Sora:对Sora架构的开源复现,支持长视频
1.2 技术选型对比¶
| 维度 | CogVideoX-5B | Open-Sora 1.2 |
|---|---|---|
| 架构 | Expert DiT + 3D因果VAE | STDiT + 3D VAE |
| 文本编码器 | T5-XXL | T5-XXL |
| 输出 | 6秒 480p (49帧@8fps) | 最长16秒 720p |
| 推理显存 | ~18GB (FP16+offload) | ~24GB |
| 生成质量 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| 生态完善度 | diffusers原生支持 | 自有框架 |
1.3 硬件需求¶
| 配置 | 最低要求 | 推荐配置 |
|---|---|---|
| GPU | RTX 3090 (24GB) | RTX 4090 / A100 |
| 内存 | 32GB | 64GB |
| 硬盘 | 50GB(模型权重) | 100GB+ |
| CUDA | 11.8+ | 12.1+ |
2. 环境配置¶
2.1 基础环境搭建¶
Bash
# 1. Create and activate the conda environment
conda create -n video_gen python=3.10 -y
conda activate video_gen

# 2. Install PyTorch (CUDA 12.1 wheels)
pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu121

# 3. Install diffusers and its dependencies.
#    NOTE: the specifiers are quoted on purpose — unquoted, the shell treats
#    ">=0.31.0" as an output redirection (creating a file named "=0.31.0")
#    and "[...]" may be expanded as a glob (e.g. by zsh).
pip install "diffusers[torch]>=0.31.0"
pip install "transformers>=4.44.0"
pip install "accelerate>=0.33.0"
pip install sentencepiece protobuf

# 4. Video I/O dependencies
pip install "imageio[ffmpeg]" opencv-python
pip install decord  # fast video decoding
2.2 模型下载¶
Python
"""Method 1: download model weights via the HuggingFace Hub (recommended)."""
from huggingface_hub import snapshot_download

# repo_id -> local target directory.
# CogVideoX-5b is ~20GB; the 2B variant (~10GB) is the fallback when VRAM is tight.
MODELS = {
    "THUDM/CogVideoX-5b": "./models/CogVideoX-5b",
    "THUDM/CogVideoX-2b": "./models/CogVideoX-2b",
}

for repo_id, local_dir in MODELS.items():
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        # Users in mainland China can point at a mirror:
        # endpoint="https://hf-mirror.com"
    )
Bash
# Method 2: download with huggingface-cli
# Users in mainland China: set a mirror endpoint to speed up the download
export HF_ENDPOINT=https://hf-mirror.com
huggingface-cli download THUDM/CogVideoX-5b --local-dir ./models/CogVideoX-5b
2.3 验证环境¶
Python
# Environment sanity check: print library versions and GPU capabilities.
import torch
import diffusers

print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
# Only query device properties when a GPU is actually present, otherwise
# get_device_name/get_device_properties raise.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # Fixed: the attribute is `total_memory`, not `total_mem`.
    print(f"显存: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
print(f"Diffusers版本: {diffusers.__version__}")
3. CogVideoX文本到视频¶
3.1 基础推理¶
Python
"""
CogVideoX 文本到视频 — 完整推理流程
"""
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video
# === 1. 加载模型 ===
model_path = "THUDM/CogVideoX-5b" # 或本地路径 "./models/CogVideoX-5b"
pipe = CogVideoXPipeline.from_pretrained(
model_path,
torch_dtype=torch.bfloat16, # CogVideoX推荐bfloat16
)
# === 2. 显存优化 ===
pipe.enable_model_cpu_offload() # 按需加载到GPU
pipe.vae.enable_tiling() # VAE分块解码
pipe.vae.enable_slicing() # VAE切片处理
# === 3. 生成视频 ===
prompt = """
A majestic eagle soaring over snow-capped mountains during golden hour.
The camera slowly pans from left to right, revealing a vast landscape
with a crystal-clear lake below. Cinematic, 4K quality.
"""
# 生成参数
video_frames = pipe(
prompt=prompt,
num_videos_per_prompt=1,
num_inference_steps=50, # 采样步数(默认50)
num_frames=49, # 帧数(CogVideoX: 49帧=6秒@8fps)
guidance_scale=6.0, # CFG强度
generator=torch.Generator(device="cpu").manual_seed(42),
).frames[0]
# === 4. 保存视频 ===
export_to_video(video_frames, "text2video_output.mp4", fps=8)
print(f"视频已保存: text2video_output.mp4")
print(f"帧数: {len(video_frames)}, 分辨率: {video_frames[0].size}")
3.2 批量生成与参数调优¶
Python
"""
CogVideoX 批量生成与参数探索
"""
import torch
import os
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video
pipe = CogVideoXPipeline.from_pretrained(
"THUDM/CogVideoX-2b", # 使用2B版本降低显存需求
torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
# 不同提示词和参数的生成实验
experiments = [
{
"prompt": "Ocean waves crashing on a tropical beach at sunset, slow motion",
"guidance_scale": 6.0,
"num_inference_steps": 50,
"seed": 42,
},
{
"prompt": "A cat sitting on a windowsill watching rain outside, cozy atmosphere",
"guidance_scale": 7.0,
"num_inference_steps": 50,
"seed": 123,
},
{
"prompt": "Timelapse of a flower blooming in a garden, macro photography",
"guidance_scale": 5.0,
"num_inference_steps": 30, # 测试较少步数
"seed": 456,
},
]
os.makedirs("outputs", exist_ok=True)
for i, exp in enumerate(experiments): # enumerate同时获取索引和元素
print(f"\n[{i+1}/{len(experiments)}] 生成: {exp['prompt'][:50]}...") # 切片操作,取前n个元素
video = pipe(
prompt=exp["prompt"],
num_frames=49,
guidance_scale=exp["guidance_scale"],
num_inference_steps=exp["num_inference_steps"],
generator=torch.Generator("cpu").manual_seed(exp["seed"]),
).frames[0]
output_path = f"outputs/exp_{i+1}_cfg{exp['guidance_scale']}_steps{exp['num_inference_steps']}.mp4"
export_to_video(video, output_path, fps=8)
print(f" 已保存: {output_path}")
3.3 Prompt工程技巧¶
Python
# === Prompt best practices for CogVideoX ===

# ✅ Good prompt: detailed and structured
good_prompt = """
A young woman with long black hair walks through a Japanese garden in autumn.
Cherry blossom petals fall gently around her.
The camera follows her from behind, slowly revealing a traditional wooden bridge.
Warm afternoon sunlight filters through the red maple leaves.
Cinematic, shallow depth of field, 4K, Film grain.
"""

# ❌ Bad prompt: too short, lacking detail
bad_prompt = "woman walking in garden"

# 📌 Suggested prompt structure:
# 1. Subject (who/what)
# 2. Scene / environment (where)
# 3. Action / motion
# 4. Camera movement
# 5. Lighting / mood
# 6. Quality tags
4. CogVideoX图像到视频¶
4.1 Image-to-Video推理¶
Python
"""
CogVideoX 图像到视频 — 从单张图片生成动态视频
"""
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
# 加载Image-to-Video模型
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
"THUDM/CogVideoX-5b-I2V",
torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
# 加载输入图像(建议720×480或480×720)
image = load_image("input_scene.png").resize((720, 480))
prompt = "The scene comes alive with gentle wind blowing through the trees, birds flying across the sky"
video = pipe(
prompt=prompt,
image=image,
num_frames=49,
num_inference_steps=50,
guidance_scale=6.0,
generator=torch.Generator("cpu").manual_seed(42),
).frames[0]
export_to_video(video, "img2video_output.mp4", fps=8)
print("图像到视频生成完成!")
5. Open-Sora视频生成¶
5.1 环境配置¶
Bash
# Clone the Open-Sora repository
git clone https://github.com/hpcaitech/Open-Sora.git
cd Open-Sora

# Install dependencies (editable install)
pip install -e .
# or
pip install -r requirements.txt
5.2 使用Open-Sora CLI生成¶
Bash
# Text-to-video from the command line
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--prompt "A serene lake surrounded by mountains at dawn" \
--num-frames 51 \
--resolution 480p \
--num-sampling-steps 30
5.3 使用Python API生成¶
Python
"""
Open-Sora 文本到视频推理
注意:需要在Open-Sora仓库目录下运行
"""
import torch
from opensora.utils.inference_utils import (
load_model,
get_save_path_name,
save_sample,
)
# 加载模型配置
config = {
"model_type": "STDiT2-XL/2",
"resolution": "480p",
"num_frames": 51,
"fps": 24,
}
# 初始化模型
model, vae, text_encoder, scheduler = load_model(
model_path="hpcai-tech/OpenSora-STDiT-v3",
config=config,
)
# 生成
prompt = "Aerial view of a coastal city with waves crashing on the shore, golden hour lighting"
samples = model.generate(
prompt=prompt,
num_frames=config["num_frames"],
resolution=config["resolution"],
num_sampling_steps=30,
cfg_scale=7.0,
)
# 保存
save_sample(samples, "opensora_output.mp4", fps=24)
6. 常见问题与调优¶
6.1 显存不足(OOM)¶
Python
# === VRAM optimisation options ===

# Option 1: lower precision.
# NOTE(review): CogVideoX's recommended dtype is bfloat16 (see the loading
# examples above); float16 is the fallback for GPUs without bf16 support.
pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=torch.float16)

# Option 2: CPU offload + tiled/sliced VAE decode (best first choice).
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

# Option 3: use a smaller model.
# CogVideoX: 5B -> 2B
# Open-Sora: pick a smaller STDiT variant

# Option 4: fewer frames and lower resolution.
# Fixed: the pipeline returns an output object — take .frames[0] to get the
# list of frames, consistent with the other examples in this tutorial.
video = pipe(prompt=prompt, num_frames=25, height=320, width=480).frames[0]

# Option 5: sequential CPU offload — the most aggressive single-GPU option.
# It streams the model layer by layer, trading speed for minimum VRAM.
# (Multi-GPU sequence-parallel inference is a separate mechanism provided by
# the CogVideoX repository, not by this call.)
pipe.enable_sequential_cpu_offload()
6.2 生成质量问题排查¶
Python
# === Common quality problems and remedies ===

# Problem 1: unnatural / jittery motion
# Remedy: more sampling steps and a slightly lower CFG.
# Fixed: take .frames[0] from the pipeline output, as in the other examples.
video = pipe(prompt=prompt, num_inference_steps=50, guidance_scale=5.0).frames[0]

# Problem 2: temporal inconsistency (objects flicker / morph)
# Remedy: a richer negative prompt — and actually pass it to the pipeline
# (the original snippet defined it but never used it).
negative = "flickering, morphing, inconsistent, blurry, distorted, low quality"
video = pipe(prompt=prompt, negative_prompt=negative).frames[0]

# Problem 3: output does not match the text
# Remedy: improve the prompt with a more precise description —
# subject, action, scene, camera movement, lighting, style.

# Problem 4: generation is slow
# Remedy: fewer steps + torch.compile on the transformer.
pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
6.3 参数调优指南¶
| 参数 | 范围 | 建议 | 影响 |
|---|---|---|---|
| `num_inference_steps` | 20-100 | 50 | 步数越多质量越高,速度越慢 |
| `guidance_scale` | 1.0-15.0 | 5.0-7.0 | 过高导致过饱和,过低不忠于文本 |
| `num_frames` | 13-49 | 49 | CogVideoX最大49帧 |
| `seed` | 任意整数 | 多试几个 | 不同seed差异很大 |
6.4 视频后处理¶
Python
"""
视频后处理工具
"""
import subprocess
import os
def upscale_video(input_path, output_path, scale=2):
    """Upscale a video by *scale*x per dimension using ffmpeg bicubic scaling.

    Args:
        input_path: source video file.
        output_path: destination file (encoded H.264, CRF 18).
        scale: integer upscale factor applied to width and height.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).

    The command is passed as an argument list with shell=False so that paths
    containing spaces or shell metacharacters can neither break the command
    nor inject extra shell syntax (the original f-string + shell=True did both).
    """
    cmd = [
        "ffmpeg", "-i", input_path,
        "-vf", f"scale=iw*{scale}:ih*{scale}:flags=bicubic",
        "-c:v", "libx264", "-crf", "18",
        output_path,
    ]
    subprocess.run(cmd, check=True)
def interpolate_frames(input_path, output_path, target_fps=24):
    """Raise the frame rate via motion-compensated interpolation (minterpolate).

    Args:
        input_path: source video file.
        output_path: destination file (H.264, CRF 18) at *target_fps*.
        target_fps: desired output frame rate.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).

    Uses an argument list with shell=False so paths with spaces or shell
    metacharacters are passed through safely.
    """
    cmd = [
        "ffmpeg", "-i", input_path,
        "-filter:v", f"minterpolate=fps={target_fps}:mi_mode=mci",
        "-c:v", "libx264", "-crf", "18",
        output_path,
    ]
    subprocess.run(cmd, check=True)
def add_audio(video_path, audio_path, output_path):
    """Mux a background-audio track into a generated (silent) video.

    Args:
        video_path: input video (stream is copied, not re-encoded).
        audio_path: audio file to add (re-encoded to AAC).
        output_path: destination file; cut to the shorter of the two inputs.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).

    Uses an argument list with shell=False so paths with spaces or shell
    metacharacters are passed through safely.
    """
    cmd = [
        "ffmpeg",
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy", "-c:a", "aac", "-shortest",
        output_path,
    ]
    subprocess.run(cmd, check=True)
def create_loop(input_path, output_path, loops=3):
    """Write a video that plays the input *loops* times back to back.

    Args:
        input_path: source video file.
        output_path: destination file (streams copied, no re-encode).
        loops: total number of play-throughs; ffmpeg's -stream_loop counts
            *extra* repetitions, hence loops - 1.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).

    Uses an argument list with shell=False so paths with spaces or shell
    metacharacters are passed through safely.
    """
    cmd = [
        "ffmpeg",
        "-stream_loop", str(loops - 1),
        "-i", input_path,
        "-c", "copy",
        output_path,
    ]
    subprocess.run(cmd, check=True)
# 使用示例
# upscale_video("output_480p.mp4", "output_960p.mp4", scale=2)
# interpolate_frames("output_8fps.mp4", "output_24fps.mp4", target_fps=24)
7. 完整项目流程示例¶
Python
"""
完整视频生成项目 — 从prompt到最终输出
"""
import torch
import os
import time
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video
def generate_video(
    prompt: str,
    output_dir: str = "project_outputs",
    model_id: str = "THUDM/CogVideoX-2b",
    num_frames: int = 49,
    num_steps: int = 50,
    guidance_scale: float = 6.0,
    seed: int = 42,
):
    """End-to-end text-to-video generation.

    Loads the CogVideoX pipeline, samples one clip for *prompt*, saves it
    under *output_dir* and prints timing / size information along the way.

    Returns:
        Path of the exported mp4 file.
    """
    os.makedirs(output_dir, exist_ok=True)

    banner = "=" * 60
    print(banner)
    print(f"视频生成项目")
    print(f"模型: {model_id}")
    print(f"Prompt: {prompt[:80]}...")
    print(banner)

    # 1. Pipeline setup: bfloat16 weights, CPU offload, tiled VAE decode.
    print("\n[1/4] 加载模型...")
    load_start = time.time()
    pipe = CogVideoXPipeline.from_pretrained(
        model_id, torch_dtype=torch.bfloat16
    )
    pipe.enable_model_cpu_offload()
    pipe.vae.enable_tiling()
    print(f" 模型加载耗时: {time.time()-load_start:.1f}s")

    # 2. Sampling.
    print("\n[2/4] 生成视频...")
    sample_start = time.time()
    result = pipe(
        prompt=prompt,
        num_frames=num_frames,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        generator=torch.Generator("cpu").manual_seed(seed),
    )
    print(f" 生成耗时: {time.time() - sample_start:.1f}s")

    # 3. Export to disk.
    print("\n[3/4] 保存视频...")
    frames = result.frames[0]
    output_path = os.path.join(output_dir, f"video_seed{seed}.mp4")
    export_to_video(frames, output_path, fps=8)

    # 4. Summary (total time includes model loading).
    print("\n[4/4] 生成摘要:")
    print(f" 输出路径: {output_path}")
    print(f" 帧数: {len(frames)}")
    print(f" 分辨率: {frames[0].size}")
    print(f" 时长: {len(frames)/8:.1f}秒")
    print(f" 总耗时: {time.time()-load_start:.1f}s")
    return output_path
if __name__ == "__main__":
    # Two demo prompts; each run gets a distinct seed (0, 100, ...).
    demo_prompts = [
        "A time-lapse of clouds moving over a mountain range, dramatic lighting, 4K cinematic",
        "Close-up of coffee being poured into a white cup, steam rising, warm morning light",
    ]
    total = len(demo_prompts)
    for idx, demo_prompt in enumerate(demo_prompts):
        print(f"\n{'='*60}")
        print(f"生成第 {idx+1}/{total} 个视频")
        generate_video(prompt=demo_prompt, seed=idx*100)
📋 面试要点¶
高频面试题¶
- CogVideoX推理时如何优化显存?
- `enable_model_cpu_offload()`:DiT/VAE/T5按需加载到GPU
- `vae.enable_tiling()`:VAE分块编解码,避免一次性处理全部帧
- 使用bfloat16精度减少约50%显存
- 必要时选择2B模型替代5B模型
-
视频生成中guidance_scale如何选择?
- 过低(<3.0):生成内容与文本不匹配
- 适中(5.0-7.0):质量与一致性的最佳平衡
- 过高(>10.0):过饱和、伪影、运动僵硬
-
LCM加速模型使用更低的CFG(1.0-2.0)
-
如何提升生成视频的时间一致性?
- 使用足够的采样步数(≥50步)
- 适中的CFG避免过度约束
- prompt中加入运动描述帮助模型理解动态
- 后处理:帧插值平滑过渡
✏️ 练习¶
练习1:环境搭建¶
按照教程完成CogVideoX的完整环境配置,成功生成第一个视频并记录显存和时间。
练习2:Prompt实验¶
设计5个不同类型的prompt(自然风景、人物动作、特效场景、微距摄影、动漫风格),生成视频并对比效果。
练习3:参数消融¶
固定prompt和seed,分别测试guidance_scale从3.0到10.0(步长1.0)的效果,记录每次的主观质量评分。
练习4:完整项目¶
构建一个简单的视频生成Web服务: - 使用Gradio搭建前端界面 - 接收用户的文本/图像输入 - 调用CogVideoX生成视频并返回 - 支持参数调节(步数、CFG、帧数)
参考资源¶
- CogVideoX GitHub: https://github.com/THUDM/CogVideo
- Open-Sora GitHub: https://github.com/hpcaitech/Open-Sora
- Diffusers文档: https://huggingface.co/docs/diffusers
- HuggingFace镜像(国内加速): https://hf-mirror.com