后续处理多模态数据的场景会比较多,本文还在更新中...(应该会涉及很多使用模型的处理链路)
Image
主要用 PIL 这个库
# Basic image I/O with Pillow: open, inspect, resize, crop, convert to NumPy.
from PIL import Image  # must be imported before the first Image.open call
import numpy as np

img = Image.open("sample.jpg")
print(img.size)  # (width, height)
print(img.mode)  # e.g. "RGB", or "L" for 8-bit grayscale

# resize: forces the image to exactly 224x224. This does NOT keep the aspect
# ratio, so object shapes are distorted unless the source is square
# (Image.thumbnail is the aspect-preserving alternative).
img_2 = img.resize((224, 224))

# crop: the box is (left, upper, right, lower) in pixels -> a 100x100 patch
img_2 = img_2.crop((100, 100, 200, 200))

arr = np.array(img_2)
print(arr.shape)  # (H, W, 3) for an RGB image
# mask
# Blank (all-black) image with the same size as the source; shown for
# reference only, it is not used by the overlay below.
mask = Image.new("RGB", img.size, 0)

# Palette: row i holds the RGB color used to paint class id i.
colors = np.array([
    [0,   0,   0  ],  # class 0: background (black)
    [255, 0,   0  ],  # class 1: red
    [0,   255, 0  ],  # class 2: green
    [0,   0,   255],  # class 3: blue
    [255, 255, 0  ],  # class 4: yellow
], dtype=np.uint8)

# Random per-pixel class ids standing in for a real segmentation mask.
# PIL's size is (W, H), so it is reversed to get NumPy's (H, W) layout.
# randint's upper bound is exclusive, so len(colors) covers every palette row.
mask_np = np.random.randint(0, len(colors), size=img.size[::-1], dtype=np.uint8)
print(mask_np.shape)  # H x W

# Palette lookup: (H, W) class ids -> (H, W, 3) uint8 colors.
colors_arr = colors[mask_np]
# Keep the source photo in `img`; the colorized mask gets its own name.
overlay_img = Image.fromarray(colors_arr)  # mode is inferred as RGB

# blend
# Image.blend requires both images to share size and mode, hence the convert.
# Result = img * (1 - alpha) + overlay_img * alpha, so alpha is the overlay weight.
vis = Image.blend(img.convert("RGB"), overlay_img, alpha=0.2)
vis.show()
# Draw a bounding box and its text label on an image
from PIL import Image, ImageDraw, ImageFont

img = Image.open("sample.jpg").convert("RGB")
draw = ImageDraw.Draw(img)

# Box corners given as [xmin, ymin, xmax, ymax]; the label is optional
box = [50, 60, 200, 220]
label = "cat"
xmin, ymin = box[0], box[1]

# Outline of the box
draw.rectangle(box, outline="red", width=3)
# Label anchored 10 px above the box's top-left corner
draw.text((xmin, ymin - 10), label, fill="red")

img.show()
Video
主要用 OpenCV(cv2)
# Read a video, run Canny edge detection per frame, preview it and save it.
import cv2

out_path = "edges.mp4"  # destination file for the edge video

cap = cv2.VideoCapture("input.mp4")  # replace with your video path; pass 0 to open the webcam
if not cap.isOpened():
    raise OSError("could not open video source: input.mp4")

fps = cap.get(cv2.CAP_PROP_FPS)
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h), isColor=True)

# write edges
try:
    while True:
        ret, frame = cap.read()
        if not ret:  # end of stream or read failure
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 100, 200)
        # The writer was opened with isColor=True, so the single-channel edge
        # map is expanded to 3 channels before it is written.
        writer.write(cv2.cvtColor(edges, cv2.COLOR_GRAY2BGR))
        cv2.imshow("edges", edges)
        # imshow only repaints while waitKey runs; press "q" to stop early.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always flush the output file and free the capture and GUI windows.
    cap.release()
    writer.release()
    cv2.destroyAllWindows()
# add text and shapes
# Open a fresh capture: one that has already been read to the end returns no
# more frames, so reusing it would skip this loop entirely.
cap = cv2.VideoCapture("input.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    # Also covers a source that failed to open (properties then read as 0).
    raise ValueError("could not read a valid FPS from input.mp4")
dt = 1.0 / fps  # seconds per frame; constant, so computed once

frame_idx = 0
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # end of stream or read failure
            break
        frame_idx += 1
        seconds = frame_idx / fps

        # Overlay the frame counter, elapsed time and frame interval.
        cv2.putText(frame, f"frame_index: {frame_idx}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"seconds: {seconds:.2f}s", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"dt: {dt:.2f}s", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        # draw a rectangle around the text area
        cv2.rectangle(frame, (0, 0), (350, 100), (0, 255, 0), 2)

        # show the annotated frame; imshow only repaints while waitKey runs
        cv2.imshow("frame", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):  # press "q" to stop early
            break
finally:
    cap.release()
    cv2.destroyAllWindows()