# -*- coding: UTF-8 -*-
import os

import cv2
import numpy as np
import torch
import torchvision


def xyxy2xywh(x):
    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
    y[:, 2] = x[:, 2] - x[:, 0]  # width
    y[:, 3] = x[:, 3] - x[:, 1]  # height
    return y


def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y


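# Illustrative sanity check for the two conversions above (a minimal sketch;
# the helper and its sample box are assumptions for demonstration, not part
# of the original file):
def _check_box_roundtrip():
    b = np.array([[10.0, 20.0, 50.0, 80.0]])  # one box as [x1, y1, x2, y2]
    xywh = xyxy2xywh(b)                        # -> [[30., 50., 40., 60.]] (center, size)
    assert np.allclose(xywh2xyxy(xywh), b)     # converting back recovers the input

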
def box_iou(box1, box2):
    """
    Return intersection-over-union (Jaccard index) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Reference: https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
    Arguments:
        box1 (Tensor[N, 4])
        box2 (Tensor[M, 4])
    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """

    def box_area(box):
        # box = 4xn
        return (box[2] - box[0]) * (box[3] - box[1])

    area1 = box_area(box1.T)
    area2 = box_area(box2.T)

    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
             torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
    # iou = inter / (area1 + area2 - inter)
    return inter / (area1[:, None] + area2 - inter)


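# A minimal, illustrative use of box_iou (the boxes below are assumed sample
# values, not from the original file): two unit-offset squares overlap in a
# 1x2 region, giving IoU = 2 / (4 + 4 - 2) = 1/3.
def _check_box_iou():
    a = torch.tensor([[0.0, 0.0, 2.0, 2.0]])  # N=1 box, area 4
    b = torch.tensor([[1.0, 0.0, 3.0, 2.0]])  # M=1 box, area 4, intersection 1x2
    assert torch.allclose(box_iou(a, b), torch.tensor([[1.0 / 3.0]]))

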
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = resized / original
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords


def clip_coords(boxes, img_shape):
    # Clip xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2


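# Illustrative example of scale_coords (a sketch with assumed shapes, not from
# the original file): a 640x640 letterboxed input over a 480x640 source image
# has gain = 1.0 and 80 px of vertical padding, so y-coordinates shift by -80.
def _check_scale_coords():
    coords = torch.tensor([[100.0, 180.0, 200.0, 280.0]])  # xyxy in the 640x640 input
    out = scale_coords((640, 640), coords.clone(), (480, 640))
    assert torch.allclose(out, torch.tensor([[100.0, 100.0, 200.0, 200.0]]))

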
def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale 5-point landmark coords (x1, y1, ..., x5, y5) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = resized / original
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2, 4, 6, 8]] -= pad[0]  # x padding
    coords[:, [1, 3, 5, 7, 9]] -= pad[1]  # y padding
    coords[:, :10] /= gain
    coords[:, 0:10:2].clamp_(0, img0_shape[1])  # landmark x1..x5
    coords[:, 1:10:2].clamp_(0, img0_shape[0])  # landmark y1..y5
    return coords


def show_results(img, xywh, conf, landmarks, class_num):
    # Draw one detection (normalized xywh box, 5 landmarks, confidence) onto img
    h, w, c = img.shape
    tl = 1  # line/font thickness (the original adaptive formula was short-circuited to 1)
    x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
    y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
    x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
    y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=tl, lineType=cv2.LINE_AA)

    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]

    for i in range(5):
        point_x = int(landmarks[2 * i] * w)
        point_y = int(landmarks[2 * i + 1] * h)
        cv2.circle(img, (point_x, point_y), tl + 1, colors[i], -1)

    tf = max(tl - 1, 1)  # font thickness
    label = str(conf)[:5]
    cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
    return img


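# Minimal usage sketch for show_results (all values below are assumed for
# illustration): box and landmarks are normalized to [0, 1] relative to the
# image's width/height, so this draws a centered box on a blank canvas.
def _demo_show_results():
    canvas = np.zeros((480, 640, 3), dtype=np.uint8)
    xywh = [0.5, 0.5, 0.25, 0.25]                   # centered box, quarter-size
    landmarks = [0.45, 0.45, 0.55, 0.45, 0.5, 0.5,  # five (x, y) points
                 0.45, 0.55, 0.55, 0.55]
    return show_results(canvas, xywh, conf=0.98, landmarks=landmarks, class_num=0)

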
def make_divisible(x, divisor):
    # Returns x rounded down to the nearest multiple of divisor
    return (x // divisor) * divisor


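# For example, sizes are floored to the stride grid (the one-line check below
# is illustrative, not from the original file):
assert make_divisible(417, 32) == 416

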
def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
    """Performs Non-Maximum Suppression (NMS) on face inference results
    Returns:
         detections with shape: nx16 (x1, y1, x2, y2, conf, 10 landmark coords, cls)
    """

    nc = prediction.shape[2] - 15  # number of classes (15 = 4 box + 1 obj + 10 landmarks)
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    # time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    # t = time.time()
    output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + 15), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 15] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 15:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx16 (xyxy, conf, landmarks, cls)
        if multi_label:
            i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 15:].max(1, keepdim=True)
            x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class (class id lives in column 15, after the 10 landmark coords)
        if classes is not None:
            x = x[(x[:, 15:16] == torch.tensor(classes, device=x.device)).any(1)]

        # If none remain process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue

        # Batched NMS
        c = x[:, 15:16] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        # if i.shape[0] > max_det:  # limit detections
        #     i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        # if (time.time() - t) > time_limit:
        #     break  # time limit exceeded

    return output


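# Illustrative shape check for the NMS above (a sketch with random values; the
# tensor sizes are assumptions matching the 1-class face layout: 4 box + 1
# objectness + 10 landmark + 1 class = 16 channels per raw prediction).
def _demo_nms_shapes():
    pred = torch.rand(1, 100, 16)  # batch of 1 image, 100 raw predictions
    det = non_max_suppression_face(pred, conf_thres=0.5, iou_thres=0.45)[0]
    assert det.shape[1] == 16      # xyxy, conf, 10 landmark coords, cls

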
class DetFace:
    def __init__(self, pt_path, confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
        assert os.path.exists(pt_path)

        self.inpSize = 416
        self.conf_thres = confThreshold
        self.iou_thres = nmsThreshold
        self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.model = torch.jit.load(pt_path).to(self.test_device)
        # Cache the coordinate grids for the last input size so they are
        # rebuilt only when the resized shape changes between frames
        self.last_w = 416
        self.last_h = 416
        self.grids = None

    @torch.no_grad()
    def detect(self, srcimg):
        # t0=time.time()

        h0, w0 = srcimg.shape[:2]  # orig hw
        r = self.inpSize / min(h0, w0)  # resize image to img_size
        h1 = int(h0 * r + 31) // 32 * 32  # round up to a multiple of the 32-pixel stride
        w1 = int(w0 * r + 31) // 32 * 32

        img = cv2.resize(srcimg, (w1, h1), interpolation=cv2.INTER_LINEAR)

        # Convert
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # BGR to RGB

        # Run inference
        img = torch.from_numpy(img).to(self.test_device).permute(2, 0, 1)
        img = img.float() / 255  # uint8 to fp32, 0-1
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference: rebuild the per-scale coordinate grids if the input size changed
        if h1 != self.last_h or w1 != self.last_w or self.grids is None:
            grids = []
            for scale in [8, 16, 32]:  # detection strides
                ny = h1 // scale
                nx = w1 // scale
                yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
                grid = torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
                grids.append(grid.to(self.test_device))
            self.grids = grids
            self.last_w = w1
            self.last_h = h1

        pred = self.model(img, self.grids).cpu()

        # Apply NMS
        det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
        # Process detections: map boxes and landmarks back to the source resolution
        # det = pred[0]
        bboxes = np.zeros((det.shape[0], 4))
        kpss = np.zeros((det.shape[0], 5, 2))
        scores = np.zeros((det.shape[0]))
        # gn = torch.tensor([w0, h0, w0, h0]).to(pred)  # normalization gain whwh
        # gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred)  # normalization gain landmarks
        det = det.cpu().numpy()

        for j in range(det.shape[0]):
            # xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
            bboxes[j, 0] = det[j, 0] * w0 / w1  # x1, rescaled to the original image
            bboxes[j, 1] = det[j, 1] * h0 / h1  # y1
            bboxes[j, 2] = det[j, 2] * w0 / w1 - bboxes[j, 0]  # width
            bboxes[j, 3] = det[j, 3] * h0 / h1 - bboxes[j, 1]  # height
            scores[j] = det[j, 4]
            # landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
            kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0 / w1, h0 / h1]])
            # class_num = det[j, 15]
            # orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
        return bboxes, kpss, scores
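

# Minimal usage sketch (the weight path and image file below are assumptions
# for illustration; the detector expects a TorchScript checkpoint exported
# with the matching grid-input signature):
if __name__ == "__main__":
    detector = DetFace("yolov5face.torchscript.pt", confThreshold=0.5, nmsThreshold=0.45)
    frame = cv2.imread("sample.jpg")  # BGR image, as returned by OpenCV
    bboxes, kpss, scores = detector.detect(frame)
    print(f"{len(scores)} face(s); first box (x, y, w, h): {bboxes[0] if len(scores) else None}")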