import os
import sys

sys.path.append("..")
sys.path.append("../utils")

import random

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import config.yolov3_config_voc as cfg
import utils.data_augment as dataAug
import utils.tools as tools


class VocDataset(Dataset):
    def __init__(self, anno_file_type, img_size=416):
        self.img_size = img_size  # for multi-scale training
        self.classes = cfg.DATA["CLASSES"]
        self.num_classes = len(self.classes)
        self.class_to_id = dict(zip(self.classes, range(self.num_classes)))
        self.__annotations = self.__load_annotations(anno_file_type)

    def __len__(self):
        return len(self.__annotations)

    def __getitem__(self, item):
        img_org, bboxes_org = self.__parse_annotation(self.__annotations[item])
        img_org = img_org.transpose(2, 0, 1)  # HWC -> CHW

        item_mix = random.randint(0, len(self.__annotations) - 1)
        img_mix, bboxes_mix = self.__parse_annotation(self.__annotations[item_mix])
        img_mix = img_mix.transpose(2, 0, 1)

        img, bboxes = dataAug.Mixup()(img_org, bboxes_org, img_mix, bboxes_mix)
        del img_org, bboxes_org, img_mix, bboxes_mix

        label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = self.__creat_label(bboxes)

        img = torch.from_numpy(img).float()
        label_sbbox = torch.from_numpy(label_sbbox).float()
        label_mbbox = torch.from_numpy(label_mbbox).float()
        label_lbbox = torch.from_numpy(label_lbbox).float()
        sbboxes = torch.from_numpy(sbboxes).float()
        mbboxes = torch.from_numpy(mbboxes).float()
        lbboxes = torch.from_numpy(lbboxes).float()

        return img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

    def __load_annotations(self, anno_type):
        assert anno_type in ['train', 'test'], "anno_type must be either 'train' or 'test'"
        anno_path = os.path.join(cfg.PROJECT_PATH, 'data', anno_type + "_annotation.txt")
        with open(anno_path, 'r') as f:
            annotations = list(filter(lambda x: len(x) > 0, f.readlines()))
        assert len(annotations) > 0, "No images found in {}".format(anno_path)

        return annotations
    def __parse_annotation(self, annotation):
        """
        Data augmentation.
        :param annotation: image path and bbox coordinates/categories,
            e.g. [image_path xmin,ymin,xmax,ymax,class_ind xmin,ymin,xmax,ymax,class_ind ...]
        :return: the augmented image and bboxes; each bbox is [xmin, ymin, xmax, ymax, class_ind]
        """
        anno = annotation.strip().split(' ')

        img_path = anno[0]
        img = cv2.imread(img_path)  # H*W*C, channel order is BGR
        assert img is not None, 'File Not Found ' + img_path
        bboxes = np.array([list(map(float, box.split(','))) for box in anno[1:]])

        img, bboxes = dataAug.RandomHorizontalFilp()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.RandomCrop()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.RandomAffine()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.Resize((self.img_size, self.img_size), True)(np.copy(img), np.copy(bboxes))

        return img, bboxes
    def __creat_label(self, bboxes):
        """
        Label assignment. For a single image, every GT bbox is assigned to anchors.
        1. Take each bbox in turn, convert its coordinates from "xyxy" to "xywh",
           and scale the xywh by the strides of the three detection layers.
        2. For each detection layer, compute the IoU between the bbox and that layer's
           anchors, and let every anchor with IoU > 0.3 predict the bbox. If the IoUs of
           all detection layers are below 0.3, the single anchor with the largest IoU
           over all layers predicts the bbox.
        Note:
        1. The same GT may be assigned to several anchors, which may lie on the same
           layer or on different layers.
        2. The total number of assigned bboxes can therefore exceed the number of GTs,
           because one GT may be assigned to several detection layers.
        """
        anchors = np.array(cfg.MODEL["ANCHORS"])
        strides = np.array(cfg.MODEL["STRIDES"])
        train_output_size = self.img_size / strides
        anchors_per_scale = cfg.MODEL["ANCHORS_PER_SCLAE"]

        label = [np.zeros((int(train_output_size[i]), int(train_output_size[i]),
                           anchors_per_scale, 6 + self.num_classes)) for i in range(3)]
        for i in range(3):
            label[i][..., 5] = 1.0

        bboxes_xywh = [np.zeros((150, 4)) for _ in range(3)]  # in Darknet the max num is 30
        bbox_count = np.zeros((3,))

        for bbox in bboxes:
            bbox_coor = bbox[:4]
            bbox_class_ind = int(bbox[4])
            bbox_mix = bbox[5]

            # one-hot class vector with label smoothing
            one_hot = np.zeros(self.num_classes, dtype=np.float32)
            one_hot[bbox_class_ind] = 1.0
            one_hot_smooth = dataAug.LabelSmooth()(one_hot, self.num_classes)

            # convert "xyxy" to "xywh"
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                                        bbox_coor[2:] - bbox_coor[:2]], axis=-1)
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]

            iou = []
            exist_positive = False
            for i in range(3):
                anchors_xywh = np.zeros((anchors_per_scale, 4))
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5  # 0.5 for compensation
                anchors_xywh[:, 2:4] = anchors[i]

                iou_scale = tools.iou_xywh_numpy(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3

                if np.any(iou_mask):
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)
                    # Bug: when several bboxes fall on the same anchor, the anchor ends up assigned to the last bbox
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    label[i][yind, xind, iou_mask, 5:6] = bbox_mix
                    label[i][yind, xind, iou_mask, 6:] = one_hot_smooth

                    bbox_ind = int(bbox_count[i] % 150)  # BUG: 150 is a hand-picked upper bound and wastes memory
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
                    bbox_count[i] += 1

                    exist_positive = True

            if not exist_positive:
                best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
                best_detect = int(best_anchor_ind / anchors_per_scale)
                best_anchor = int(best_anchor_ind % anchors_per_scale)
                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                label[best_detect][yind, xind, best_anchor, 5:6] = bbox_mix
                label[best_detect][yind, xind, best_anchor, 6:] = one_hot_smooth

                bbox_ind = int(bbox_count[best_detect] % 150)
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
                bbox_count[best_detect] += 1

        label_sbbox, label_mbbox, label_lbbox = label
        sbboxes, mbboxes, lbboxes = bboxes_xywh

        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
if __name__ == "__main__":
    voc_dataset = VocDataset(anno_file_type="train", img_size=448)
    dataloader = DataLoader(voc_dataset, shuffle=True, batch_size=1, num_workers=0)

    for i, (img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes) in enumerate(dataloader):
        if i == 0:
            print(img.shape)
            print(label_sbbox.shape)
            print(label_mbbox.shape)
            print(label_lbbox.shape)
            print(sbboxes.shape)
            print(mbboxes.shape)
            print(lbboxes.shape)

            if img.shape[0] == 1:
                labels = np.concatenate([label_sbbox.reshape(-1, 26), label_mbbox.reshape(-1, 26),
                                         label_lbbox.reshape(-1, 26)], axis=0)
                labels_mask = labels[..., 4] > 0
                labels = np.concatenate([labels[labels_mask][..., :4],
                                         np.argmax(labels[labels_mask][..., 6:], axis=-1).reshape(-1, 1)], axis=-1)
                print(labels.shape)
                tools.plot_box(labels, img, id=1)
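To make the coordinate handling inside __creat_label concrete, here is a minimal standalone NumPy sketch (the box coordinates are made up) of the "xyxy" -> "xywh" conversion and the per-stride scaling performed before anchor matching:

import numpy as np

# a hypothetical GT box on a 448x448 image: xmin, ymin, xmax, ymax
bbox_coor = np.array([100., 150., 260., 310.])
strides = np.array([8, 16, 32])

# "xyxy" -> "xywh": center point plus width/height, exactly as in __creat_label
bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                            bbox_coor[2:] - bbox_coor[:2]], axis=-1)
print(bbox_xywh)         # [180. 230. 160. 160.]

# scale the box onto each detection layer's grid (strides 8, 16, 32)
bbox_xywh_scaled = bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]
print(bbox_xywh_scaled)  # [[22.5 28.75 20. 20.], [11.25 14.375 10. 10.], [5.625 7.1875 5. 5.]]

# the grid cell that owns the box center on each layer
print(np.floor(bbox_xywh_scaled[:, 0:2]).astype(np.int32))  # [[22 28], [11 14], [5 7]]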
VOC2CSV
import os
import random
import math
import argparse
from tqdm import tqdm
import xml.etree.ElementTree as ET
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--indir", type=str, default="")
    parser.add_argument("-p", "--percent", type=float, default=0.2)
    parser.add_argument("-t", "--train", type=str, default="")
    parser.add_argument("-v", "--val", type=str, default="")
    parser.add_argument("-c", "--classes", type=str, default="")
    args = parser.parse_args()

    return args
# Collect the files with a given suffix under a directory and return them as a list
def get_file_index(indir, postfix):
    print(indir)
    file_list = []
    for root, dirs, files in os.walk(indir):
        for name in files:
            if postfix in name:
                file_list.append(os.path.join(root, name))
    return file_list


# Write the annotation information to a csv file
def convert_annotation(csv, address_list):
    cls_list = []
    with open(csv, "w") as f:
        for i, address in enumerate(tqdm(address_list)):
            in_file = open(address, encoding="utf-8")
            xml_str = in_file.read()
            in_file.close()
            root = ET.XML(xml_str)  # parse the XML string (not the closed file handle)
            for obj in root.iter("object"):
                cls = obj.find("name").text
                cls_list.append(cls)
                xmlbox = obj.find("bndbox")
                # read the four bbox values from the xml file and cast them to int
                b = (int(float(xmlbox.find("xmin").text)), int(float(xmlbox.find("ymin").text)),
                     int(float(xmlbox.find("xmax").text)), int(float(xmlbox.find("ymax").text)))
                f.write(file_dict[address_list[i]])
                f.write("," + ",".join([str(a) for a in b]) + "," + cls)
                f.write("\n")
    return cls_list
if __name__ == "__main__":
    args = parse_args()
    file_address = args.indir
    test_percent = args.percent
    train_csv = args.train
    test_csv = args.val
    class_csv = args.classes

    Annotations = get_file_index(file_address + "/Annotations", ".xml")
    Annotations.sort()
    JPEGfiles = get_file_index(file_address + "/JPEGImages", ".jpg")
    JPEGfiles.sort()
    assert len(Annotations) == len(JPEGfiles)

    file_dict = dict(zip(Annotations, JPEGfiles))
    num = len(Annotations)
    test = random.sample(Annotations, k=math.ceil(num * test_percent))
    train = list(set(Annotations) - set(test))

    cls_list1 = convert_annotation(train_csv, train)
    cls_list2 = convert_annotation(test_csv, test)
    cls_unique = list(set(cls_list1 + cls_list2))

    with open(class_csv, "w") as f:
        for i, cls in enumerate(cls_unique):
            f.write(cls + "," + str(i) + "\n")
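For reference, a typical invocation of this converter and the row format it writes look roughly like the following (the script filename and dataset path are assumptions; only the flags come from parse_args above):

python voc2csv.py -i ./VOCdevkit/VOC2007 -p 0.2 -t train.csv -v val.csv -c classes.csv
# each row of train.csv / val.csv: image_path,xmin,ymin,xmax,ymax,class_name
# each row of classes.csv:         class_name,class_id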
YOLOv3 configuration file

# dataset file path; project path
DATA_PATH = "./data/VOC"
PROJECT_PATH = r"E:/CV/CV-图像检测/yolov3"

# class list and number of classes
DATA = {"CLASSES": ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
                    'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
                    'train', 'tvmonitor'],
        "NUM": 20}

# model: anchors, strides, anchors per scale
MODEL = {"ANCHORS": [[(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)],              # anchors for small objects
                     [(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)],      # anchors for medium objects
                     [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]],  # anchors for big objects
         "STRIDES": [8, 16, 32],
         "ANCHORS_PER_SCLAE": 3}

# training configuration
TRAIN = {"TRAIN_IMG_SIZE": 448, "AUGMENT": True, "BATCH_SIZE": 4, "MULTI_SCALE_TRAIN": True,
         "IOU_THRESHOLD_LOSS": 0.5, "EPOCHS": 50, "NUMBER_WORKERS": 4, "MOMENTUM": 0.9,
         "WEIGHT_DECAY": 0.0005, "LR_INIT": 1e-4, "LR_END": 1e-6, "WARMUP_EPOCHS": 2}  # WARMUP_EPOCHS: 2 or None

# test configuration
TEST = {"TEST_IMG_SIZE": 448, "BATCH_SIZE": 4, "NUMBER_WORKERS": 2, "CONF_THRESH": 0.01,
        "NMS_THRESH": 0.5, "MULTI_SCALE_TEST": False, "FLIP_TEST": False}
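Note that the anchors in MODEL["ANCHORS"] are stored in grid (stride) units rather than pixels, which is why __creat_label divides the GT boxes by the strides before matching. A quick sketch verifying that multiplying each pair by its layer's stride recovers the familiar YOLOv3 pixel-space anchors such as (10, 13) and (373, 326):

ANCHORS = [[(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)],
           [(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)],
           [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]]
STRIDES = [8, 16, 32]

# anchor size in pixels = anchor (grid units) * stride of that detection layer
for anchors, stride in zip(ANCHORS, STRIDES):
    print([(w * stride, h * stride) for w, h in anchors])
# stride 8  -> (10, 13), (16, 30), (33, 23)
# stride 16 -> (30, 61), (62, 45), (59, 119)
# stride 32 -> (116, 90), (156, 198), (373, 326)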
import torch
import torch.nn as nn
import torch.nn.functional as F
from .activate import *

norm_name = {"bn": nn.BatchNorm2d}
activate_name = {"relu": nn.ReLU, "leaky": nn.LeakyReLU, "mish": Mish}


class Convolutional(nn.Module):
    def __init__(self, filters_in, filters_out, kernel_size, stride, pad, norm=None, activate=None):
        super(Convolutional, self).__init__()
        self.norm = norm
        self.activate = activate

        self.__conv = nn.Conv2d(in_channels=filters_in, out_channels=filters_out, kernel_size=kernel_size,
                                stride=stride, padding=pad, bias=not norm)
        if norm:
            assert norm in norm_name.keys()
            if norm == "bn":
                self.__norm = norm_name[norm](num_features=filters_out)
        if activate:
            assert activate in activate_name.keys()
            if activate == "leaky":
                self.__activate = activate_name[activate](negative_slope=0.1, inplace=True)
            if activate == "relu":
                self.__activate = activate_name[activate](inplace=True)
            if activate == "mish":  # missing in the original: without this branch, activate="mish" crashes in forward()
                self.__activate = activate_name[activate]()

    def forward(self, x):
        x = self.__conv(x)
        if self.norm:
            x = self.__norm(x)
        if self.activate:
            x = self.__activate(x)
        return x
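As a sanity check, the wrapper can be used like this minimal sketch (the input shape and channel counts are only illustrative):

import torch

# Conv + BN + LeakyReLU block: 3 -> 32 channels, 3x3 kernel, stride 1, padding 1
conv = Convolutional(filters_in=3, filters_out=32, kernel_size=3, stride=1, pad=1,
                     norm="bn", activate="leaky")
x = torch.randn(1, 3, 416, 416)
print(conv(x).shape)  # torch.Size([1, 32, 416, 416])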
Residual module

import torch.nn as nn
from ..layers.conv_module import Convolutional


class Residual_block(nn.Module):
    def __init__(self, filters_in, filters_out, filters_medium):
        super(Residual_block, self).__init__()
        self.__conv1 = Convolutional(filters_in=filters_in, filters_out=filters_medium, kernel_size=1, stride=1,
                                     pad=0, norm="bn", activate="leaky")
        self.__conv2 = Convolutional(filters_in=filters_medium, filters_out=filters_out, kernel_size=3, stride=1,
                                     pad=1, norm="bn", activate="leaky")

    def forward(self, x):
        r = self.__conv1(x)
        r = self.__conv2(r)
        out = x + r
        return out
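Because of the x + r skip connection, filters_out must equal filters_in. A minimal usage sketch in the Darknet-53 style (channel counts and spatial size are illustrative):

import torch

# 64 -> 32 -> 64 channels: the 1x1 conv squeezes, the 3x3 conv restores, then the skip adds
block = Residual_block(filters_in=64, filters_out=64, filters_medium=32)
x = torch.randn(1, 64, 208, 208)
print(block(x).shape)  # torch.Size([1, 64, 208, 208])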
Activation function module

import torch
import torch.nn as nn
import torch.nn.functional as F


class Mish(nn.Module):
    def __init__(self):
        super(Mish, self).__init__()

    def forward(self, x):
        x = x * (torch.tanh(F.softplus(x)))
        return x


class Swish(nn.Module):
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        x = x * torch.sigmoid(x)
        return x
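A minimal sketch checking the two activations at a few sample points:

import torch

x = torch.tensor([-1.0, 0.0, 1.0])
print(Mish()(x))   # approx. [-0.3034, 0.0000, 0.8651]  (x * tanh(softplus(x)))
print(Swish()(x))  # approx. [-0.2689, 0.0000, 0.7311]  (x * sigmoid(x))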
Loss function module

import sys

sys.path.append("../utils")

import torch
import torch.nn as nn

from utils import tools
import config.yolov3_config_voc as cfg


class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=1.0, reduction="mean"):
        super(FocalLoss, self).__init__()
        self.__gamma = gamma
        self.__alpha = alpha
        self.__loss = nn.BCEWithLogitsLoss(reduction=reduction)

    def forward(self, input, target):
        loss = self.__loss(input=input, target=target)
        loss *= self.__alpha * torch.pow(torch.abs(target - torch.sigmoid(input)), self.__gamma)
        return loss
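To see what the modulating factor does, here is a small sketch (the logits are only illustrative): a confident correct prediction has its BCE loss scaled down sharply, while a badly wrong one keeps most of it:

import torch

focal = FocalLoss(gamma=2, alpha=1.0, reduction="none")
logits = torch.tensor([3.0, -3.0])   # confident positive, badly wrong positive
targets = torch.tensor([1.0, 1.0])

# weight = |target - sigmoid(logit)|^gamma: about 0.0023 for the easy case, about 0.91 for the hard one
print(focal(input=logits, target=targets))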
class YoloV3Loss(nn.Module):
    def __init__(self, anchors, strides, iou_threshold_loss=0.5):
        super(YoloV3Loss, self).__init__()
        self.__iou_threshold_loss = iou_threshold_loss
        self.__strides = strides

    def forward(self, p, p_d, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes):
        """
        :param p: predicted offset values of the three detection layers.
            The shape is [p0, p1, p2], e.g. p0 = [bs, grid, grid, anchors, tx+ty+tw+th+conf+cls_20]
        :param p_d: decoded predicted values, expressed at input-image scale.
            e.g. p_d0 = [bs, grid, grid, anchors, x+y+w+h+conf+cls_20]
        :param label_sbbox: label of the small-object detection layer, expressed at input-image scale.
            Shape is [bs, grid, grid, anchors, x+y+w+h+conf+mix+cls_20]
        :param label_mbbox: same as label_sbbox.
        :param label_lbbox: same as label_sbbox.
        :param sbboxes: bboxes of the small-object detection layer, expressed at input-image scale.
            Shape is [bs, 150, x+y+w+h]
        :param mbboxes: same as sbboxes.
        :param lbboxes: same as sbboxes.
        """
        strides = self.__strides

        loss_s, loss_s_giou, loss_s_conf, loss_s_cls = self.__cal_loss_per_layer(p[0], p_d[0], label_sbbox,
                                                                                 sbboxes, strides[0])
        loss_m, loss_m_giou, loss_m_conf, loss_m_cls = self.__cal_loss_per_layer(p[1], p_d[1], label_mbbox,
                                                                                 mbboxes, strides[1])
        loss_l, loss_l_giou, loss_l_conf, loss_l_cls = self.__cal_loss_per_layer(p[2], p_d[2], label_lbbox,
                                                                                 lbboxes, strides[2])

        loss = loss_l + loss_m + loss_s
        loss_giou = loss_s_giou + loss_m_giou + loss_l_giou
        loss_conf = loss_s_conf + loss_m_conf + loss_l_conf
        loss_cls = loss_s_cls + loss_m_cls + loss_l_cls

        return loss, loss_giou, loss_conf, loss_cls
    def __cal_loss_per_layer(self, p, p_d, label, bboxes, stride):
        """
        (1) Box regression loss.
            GIoU loss is defined in https://arxiv.org/abs/1902.09630.
            Note: the loss is weighted by 2 - w*h/(img_size**2), which balances the
            contribution of boxes of different sizes to the loss.
        (2) Confidence loss.
            Includes the confidence loss of foreground and background.
            Note: the background loss is only counted when the maximum IoU between the
            box predicted at a feature point and all GTs is below the threshold.
        (3) Class loss.
            The class loss is BCE over the binary value of each class.
        :param stride: the scale of this feature map relative to the original image
        :return: the batch-averaged losses (loss, loss_giou, loss_conf, loss_cls) of this detection layer.
        """
        BCE = nn.BCEWithLogitsLoss(reduction="none")
        FOCAL = FocalLoss(gamma=2, alpha=1.0, reduction="none")

        batch_size, grid = p.shape[:2]
        img_size = stride * grid

        p_conf = p[..., 4:5]
        p_cls = p[..., 5:]
        p_d_xywh = p_d[..., :4]

        label_xywh = label[..., :4]
        label_obj_mask = label[..., 4:5]
        label_cls = label[..., 6:]
        label_mix = label[..., 5:6]

        # GIoU loss
        giou = tools.GIOU_xywh_torch(p_d_xywh, label_xywh).unsqueeze(-1)
        # the scaled bbox weight balances the impact of small and large objects on the loss
        bbox_loss_scale = 2.0 - 1.0 * label_xywh[..., 2:3] * label_xywh[..., 3:4] / (img_size ** 2)
        loss_giou = label_obj_mask * bbox_loss_scale * (1.0 - giou) * label_mix

        # confidence loss
        iou = tools.iou_xywh_torch(p_d_xywh.unsqueeze(4), bboxes.unsqueeze(1).unsqueeze(1).unsqueeze(1))
        iou_max = iou.max(-1, keepdim=True)[0]
        label_noobj_mask = (1.0 - label_obj_mask) * (iou_max < self.__iou_threshold_loss).float()

        loss_conf = (label_obj_mask * FOCAL(input=p_conf, target=label_obj_mask) +
                     label_noobj_mask * FOCAL(input=p_conf, target=label_obj_mask)) * label_mix

        # class loss
        loss_cls = label_obj_mask * BCE(input=p_cls, target=label_cls) * label_mix

        loss_giou = (torch.sum(loss_giou)) / batch_size
        loss_conf = (torch.sum(loss_conf)) / batch_size
        loss_cls = (torch.sum(loss_cls)) / batch_size
        loss = loss_giou + loss_conf + loss_cls

        return loss, loss_giou, loss_conf, loss_cls
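As a quick illustration of the 2 - w*h/(img_size**2) weighting used above (the box sizes are only examples): a tiny box gets a weight close to 2, while a box covering the whole image gets a weight of 1, so small objects contribute relatively more to the GIoU term:

img_size = 448.0

for w, h in [(20.0, 20.0), (200.0, 200.0), (448.0, 448.0)]:
    scale = 2.0 - w * h / (img_size ** 2)
    print((w, h), round(scale, 3))
# (20.0, 20.0)   -> 1.998
# (200.0, 200.0) -> 1.801
# (448.0, 448.0) -> 1.0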