
Automatic Annotation with Grounding DINO for Object Detection (YOLO Format)

For setting up the Grounding DINO environment, see my earlier blog post, linked below:

How to Deploy Grounding DINO Offline on Linux - CSDN Blog

This post explains how to use the Grounding DINO project to automate annotation for object detection, and provides the relevant code along with experimental verification.

1. Dataset Preparation
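
Based on the paths used in the script in section 2.1, the project folder is assumed to look like the sketch below (these names are simply the ones the script uses; adjust them to your setup). Only images/ and classes.txt need to exist beforehand; the script creates the two output folders itself. classes.txt holds one class name per line, and the line order defines the YOLO class indices:

Auto_label/Project1/
├── images/              # input images (.jpg / .jpeg / .png)
├── detect_results/      # created by the script: annotated images
├── labels/              # created by the script: YOLO label files
└── classes.txt          # one class name per line, e.g.:
                         #   person
                         #   car
                         #   dog
                         #   cat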

 

2. Running the Experiment

2.1 Reference code for batch annotation:

import os
import cv2
import torch
from torchvision.ops import box_convert
from groundingdino.util.inference import load_model, load_image, predict, annotate

# Path configuration
MODEL_CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
MODEL_WEIGHTS_PATH = "weights/groundingdino_swint_ogc.pth"
PROJECT_ROOT = "Auto_label/Project1/"  # root path of the auto-annotation project
IMAGE_FOLDER = PROJECT_ROOT + "images"  # folder of input images
OUTPUT_FOLDER = PROJECT_ROOT + "detect_results"  # folder for annotated output images
LABELS_FOLDER = PROJECT_ROOT + "labels"  # folder for YOLO label files
CLASSES_FILE = PROJECT_ROOT + "classes.txt"  # class list file

# Convert a box to the YOLO label format
def convert_to_yolo_format(xyxy, image_width, image_height):
    """
    Convert `xyxy` pixel coordinates to normalized YOLO `x_center, y_center, width, height`.
    """
    x_min, y_min, x_max, y_max = xyxy
    x_center = (x_min + x_max) / 2.0 / image_width
    y_center = (y_min + y_max) / 2.0 / image_height
    width = abs(x_max - x_min) / image_width
    height = abs(y_max - y_min) / image_height
    return x_center, y_center, width, height

# Load the class list
def load_classes(classes_file):
    with open(classes_file, "r") as f:
        return [line.strip() for line in f.readlines()]

# Main detection and label-generation function
def process_images(model, classes, image_folder, output_folder, labels_folder):
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(labels_folder, exist_ok=True)

    for image_file in os.listdir(image_folder):
        if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        # Load the image
        image_path = os.path.join(image_folder, image_file)
        image_source, image = load_image(image_path)
        h, w, _ = image_source.shape

        # Run detection
        boxes, logits, phrases = predict(
            model=model,
            image=image,
            # caption="car . coach . bus . truck . tricycle . person . twowheelsvehicle . taxi . license_plate . other_vehicles",
            caption="person . car . dog . cat",
            box_threshold=0.35,
            text_threshold=0.25
        )

        # Scale the normalized boxes to pixel coordinates and convert cxcywh to xyxy
        yolo_boxes = boxes * torch.Tensor([w, h, w, h])
        xyxy_boxes = box_convert(boxes=yolo_boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()

        # Create the YOLO label file
        label_file = os.path.join(labels_folder, os.path.splitext(image_file)[0] + ".txt")
        with open(label_file, "w") as label_f:
            for xyxy, phrase in zip(xyxy_boxes, phrases):
                # Look up the class index
                class_idx = classes.index(phrase) if phrase in classes else -1
                if class_idx == -1:
                    continue  # skip phrases not listed in the class file

                # Convert the box to YOLO format
                x_center, y_center, width, height = convert_to_yolo_format(xyxy, w, h)
                
                # Write one YOLO label line per box
                label_f.write(f"{class_idx} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

        # Draw detections on the image and save it
        annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
        output_image_path = os.path.join(output_folder, image_file)
        cv2.imwrite(output_image_path, annotated_frame)

        print(f"Processed {image_file}, labels saved to {label_file}, annotated image saved to {output_image_path}")

# Entry point
if __name__ == "__main__":
    # Load the model and the class list
    model = load_model(MODEL_CONFIG_PATH, MODEL_WEIGHTS_PATH)
    classes = load_classes(CLASSES_FILE)

    # Process the images and generate labels
    process_images(model, classes, IMAGE_FOLDER, OUTPUT_FOLDER, LABELS_FOLDER)
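
Assuming the script is saved as, say, auto_label.py (the name is arbitrary) in the root of the Grounding DINO repository, so that the relative config and weight paths resolve, it is run with:

python auto_label.py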

The comments above are already quite detailed, so I won't belabor the code. One caveat worth noting: predict returns free-text phrases, so a detection is kept only when its phrase exactly matches an entry in classes.txt; anything else is skipped.

You mainly need to adapt the following to your environment:

  1. The path configuration
  2. The prompt passed as caption (see the sketch after this list)
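
As the two captions in the script show, Grounding DINO takes the prompt as a single string of category phrases separated by " . ". To keep the prompt and the YOLO class indices in sync, a small sketch (my own addition, reusing load_classes and CLASSES_FILE from the script above) can build the caption directly from classes.txt:

# Hypothetical helper: derive the caption from classes.txt so that the
# prompt and the YOLO class indices never drift apart.
classes = load_classes(CLASSES_FILE)
caption = " . ".join(classes)  # e.g. "person . car . dog . cat"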

2.2 Running the Detection

I mainly want to detect people, cars, dogs, and cats, so my prompt is set as follows:

caption="person . car . dog . cat",

Detection results: the annotated images are written to the Auto_label/Project1/detect_results folder.

Label files: the generated YOLO labels are written to the Auto_label/Project1/labels folder.
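
Each line of a label file follows the format the script writes: class_idx x_center y_center width height, with all four coordinates normalized to [0, 1]. For example (illustrative values only, where 0 and 1 are the line indices of person and car in classes.txt):

0 0.512345 0.430210 0.210000 0.380500
1 0.250000 0.660000 0.120000 0.150000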

2.3 Verifying the Results

To check whether the generated labels are correct, you can visualize them on the images. For visualization code, see my earlier blog post, linked below:

Object Detection: Visualizing YOLO-Format Labels - CSDN Blog
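
If you prefer not to jump to another post, the minimal sketch below (my own, assuming the folder layout from section 1) reverses the normalization and draws the YOLO boxes back onto an image with OpenCV:

import cv2

def visualize_yolo_labels(image_path, label_path, classes):
    """Draw the YOLO-format boxes from label_path onto the image and return it."""
    image = cv2.imread(image_path)
    h, w = image.shape[:2]
    with open(label_path) as f:
        for line in f:
            class_idx, xc, yc, bw, bh = line.split()
            xc, yc, bw, bh = map(float, (xc, yc, bw, bh))
            # Undo the normalization: cxcywh in [0, 1] -> pixel xyxy
            x_min, y_min = int((xc - bw / 2) * w), int((yc - bh / 2) * h)
            x_max, y_max = int((xc + bw / 2) * w), int((yc + bh / 2) * h)
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            cv2.putText(image, classes[int(class_idx)], (x_min, max(y_min - 5, 12)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    return image

# Example usage (hypothetical file names):
# vis = visualize_yolo_labels("Auto_label/Project1/images/demo.jpg",
#                             "Auto_label/Project1/labels/demo.txt",
#                             ["person", "car", "dog", "cat"])
# cv2.imwrite("demo_vis.jpg", vis)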

Absolutely perfect!!!

More recently, DINO-X has been released, which performs even better; the link is below:

GitHub - IDEA-Research/DINO-X-API: DINO-X: The World's Top-Performing Vision Model for Open-World Object Detection and Understanding
