Image 和 Video 的处理 Pipeline

January 20, 2026

后续需要处理的多模态数据会比较多,本文还在持续更新中……(应该会涉及很多使用模型的处理链路)

Image

主要用 PIL(Pillow)这个库,导入方式为 `from PIL import Image`。

import numpy as np
from PIL import Image

# Load an image from disk; the file is decoded lazily on first pixel access.
img = Image.open("sample.jpg")

print(img.size)  # (width, height)
print(img.mode)  # e.g. "RGB" for colour, "L" for 8-bit grayscale

# resize: forcing a fixed (224, 224) output does NOT keep the aspect ratio,
# so objects get stretched. When proportions matter, use
# img.thumbnail((224, 224)) instead (it shrinks the image in place).
img_2 = img.resize((224, 224))

# crop: the box is (left, upper, right, lower) in pixel coordinates
img_2 = img_2.crop((100, 100, 200, 200))

# PIL image -> NumPy array (rows first, i.e. height before width)
arr = np.array(img_2)
print(arr.shape)  # (H, W, 3) for an RGB image; here (100, 100, 3)

# mask
# Colour palette: row i is the RGB colour used to paint class id i.
colors = np.array([
    [0,   0,   0  ],  # class 0: background (black)
    [255, 0,   0  ],  # class 1: red
    [0,   255, 0  ],  # class 2: green
    [0,   0,   255],  # class 3: blue
    [255, 255, 0  ],  # class 4: yellow
], dtype=np.uint8)

# Random per-pixel class ids standing in for a real segmentation mask.
# PIL reports size as (W, H) while NumPy expects (H, W), hence the reversal.
# randint's upper bound is exclusive, so len(colors) covers every palette row.
mask_np = np.random.randint(0, len(colors), size=img.size[::-1], dtype=np.uint8)
print(mask_np.shape)  # H x W

# Fancy indexing maps every class id to its colour -> (H, W, 3) uint8 array,
# which Pillow interprets as an RGB image.
colors_arr = colors[mask_np]
overlay_img = Image.fromarray(colors_arr)

# blend
# Image.blend requires both images to share the same mode and size, so the
# base image is converted to RGB to match the overlay.
base_img = img.convert("RGB")
# result = base_img * (1 - alpha) + overlay_img * alpha
vis = Image.blend(base_img, overlay_img, alpha=0.2)
vis.show()

# draw box and labels
from PIL import Image, ImageDraw, ImageFont

# Load the picture as 3-channel RGB and get a drawing context bound to it;
# everything drawn through `draw` modifies `img` in place.
img = Image.open("sample.jpg").convert("RGB")
draw = ImageDraw.Draw(img)

# Bounding box given as [xmin, ymin, xmax, ymax] in pixels, plus its caption.
box = [50, 60, 200, 220]
label = "cat"
xmin, ymin = box[0], box[1]

# Outline the box first, then put the caption 10 px above its top-left corner.
draw.rectangle(box, outline="red", width=3)
draw.text((xmin, ymin - 10), label, fill="red")

img.show()

Video

主要用 OpenCV(Python 中的 `cv2` 模块)。

import cv2


# Replace with your own video path; passing 0 opens the default camera instead.
cap = cv2.VideoCapture("input.mp4")
if not cap.isOpened():
    raise IOError("cannot open video source: input.mp4")

out_path = "edges.mp4"  # where the edge video is saved

# Some sources (e.g. cameras) report 0 FPS; fall back to a sane default so the
# writer gets a valid frame rate.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h), isColor=True)

# write edges
try:
    while True:
        ret, frame = cap.read()
        if not ret:  # end of stream or read error
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 100, 200)

        # Canny returns a single-channel image, but the writer was opened for
        # colour frames, so expand it to 3 channels before writing.
        writer.write(cv2.cvtColor(edges, cv2.COLOR_GRAY2BGR))

        # imshow only paints when the GUI event loop runs, which waitKey does;
        # pressing "q" stops early.
        cv2.imshow("edges", edges)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Releasing the writer finalises the output file; always free the capture.
    cap.release()
    writer.release()
    cv2.destroyAllWindows()

# add text and shapes
# Open a fresh capture: one that has already been read to the end yields no
# more frames, so reusing it would skip this loop entirely.
cap = cv2.VideoCapture("input.mp4")

# Some sources report 0 FPS; fall back to a default to avoid dividing by zero.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
dt = 1.0 / fps  # time between consecutive frames; constant, so computed once

font = cv2.FONT_HERSHEY_SIMPLEX
green = (0, 255, 0)  # OpenCV colours are in BGR order

frame_idx = 0
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # end of stream or read error
            break

        frame_idx += 1
        seconds = frame_idx / fps

        # The frame index is an integer, so print it without decimals.
        cv2.putText(frame, f"frame_index: {frame_idx}", (10, 30), font, 1, green, 2)
        cv2.putText(frame, f"seconds: {seconds:.2f}s", (10, 60), font, 1, green, 2)
        cv2.putText(frame, f"dt: {dt:.2f}s", (10, 90), font, 1, green, 2)

        # draw a rectangle framing the text area
        cv2.rectangle(frame, (0, 0), (350, 100), green, 2)

        # Display the frame; waitKey runs the GUI event loop so the window
        # actually refreshes, and pressing "q" stops playback.
        cv2.imshow("frame", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()