多模态-文本提示检测图像GroundingDINO运行

1. git clone

git clone https://github.com/IDEA-Research/GroundingDINO.git

也可手动下载：https://github.com/IDEA-Research/GroundingDINO

2. 安装所需的依赖项

pip install -e .

我用该方法老是报错或者直接卡掉，选用第二种方法：

python setup.py install

3. 下载预训练权重

wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

4. 创建一个新的python文件

命名为：grounding_dino_demo.py 放在GroundingDINO目录下，其代码如下:

from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
import cv2
CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"    #源码自带的配置文件
CHECKPOINT_PATH = "./weights/groundingdino_swint_ogc.pth"   #下载的权重文件
DEVICE = "cuda"   #可以选择cpu/cuda
IMAGE_PATH = "/home/code/wll/code/GroundingDINO-main/asset/cat_dog.jpeg"    #用户设置的需要读取image的路径
TEXT_PROMPT = "dog"    #给出的文本提示
BOX_TRESHOLD = 0.35     #源码给定的边界框判定阈值
TEXT_TRESHOLD = 0.25    #源码给定的文本端获取关键属性阈值
image_source, image = load_image(IMAGE_PATH)
model = load_model(CONFIG_PATH, CHECKPOINT_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD,
    device=DEVICE,
)
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("./result_image/dog.jpg", annotated_frame)

5. 运行grounding_dino_demo.py

会报错：

原因：huggingface 貌似被墙了，所以不能在线下载bert预训练模型，只能离线下载

解决：

（1）下载betr-base-uncased模型

进入网站：，

需下载的文件如下：

（2）修改groundingdino/util/get_tokenlizer.py

from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
import os


# 获取指定文本编码器类型的分词器
def get_tokenlizer(text_encoder_type, local_files_only=True):
    # 检查text_encoder_type是否为字符串
    if not isinstance(text_encoder_type, str):
        # 如果不是字符串，尝试从对象中获取text_encoder_type属性
        if hasattr(text_encoder_type, "text_encoder_type"):
            text_encoder_type = text_encoder_type.text_encoder_type
        # 如果字典中存在"text_encoder_type"键，则从字典中获取
        elif text_encoder_type.get("text_encoder_type", False):
            text_encoder_type = text_encoder_type.get("text_encoder_type")
        # 如果是目录且存在，则保持不变
        elif os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type):
            pass
        else:
            # 如果无法确定text_encoder_type类型，则引发错误
            raise ValueError(
                "Unknown type of text_encoder_type: {}".format(type(text_encoder_type))
            )
    print("final text_encoder_type: {}".format(text_encoder_type))

    # 使用transformers库中的AutoTokenizer根据text_encoder_type加载分词器
    tokenizer = AutoTokenizer.from_pretrained(text_encoder_type, local_files_only=local_files_only)
    return tokenizer


# 获取指定预训练语言模型的模型实例
def get_pretrained_language_model(text_encoder_type, local_files_only=True):
    # 根据text_encoder_type选择合适的预训练语言模型
    if text_encoder_type == "bert-base-uncased" or (
            os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type)):
        # 如果是BERT模型，则使用BertModel加载
        return BertModel.from_pretrained(text_encoder_type, local_files_only=local_files_only)
    elif text_encoder_type == "roberta-base":
        # 如果是RoBERTa模型，则使用RobertaModel加载
        return RobertaModel.from_pretrained(text_encoder_type, local_files_only=local_files_only)

    # 如果text_encoder_type不是已知的模型类型，则引发错误
    raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type))

再次运行grounding_dino_demo.py，得到如下检测效果：