Object Detection Tricks (Based on Detectron2)
Approaches that worked
Cropping
The targets are small relative to the full image, so the images are cropped into tiles. Besides the tile size, pay attention to the overlap between adjacent tiles: keep the overlap reasonably large so that each target stays fully contained in at least one tile rather than being cut apart at a tile boundary. Here the tile size is 512 and the overlap is 256.
Reference code: DOTA_devkit
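The actual cropping was done with DOTA_devkit; purely as an illustration of the idea, a sliding-window crop with tile 512 and overlap 256 looks roughly like the sketch below (the function name and the use of OpenCV are my own choices, not DOTA_devkit's implementation):

import os
import cv2

def crop_with_overlap(img_path, out_dir, tile=512, overlap=256):
    # Slide a tile x tile window over the image; stride = tile - overlap.
    img = cv2.imread(img_path)
    h, w = img.shape[:2]
    stride = tile - overlap
    os.makedirs(out_dir, exist_ok=True)
    name = os.path.splitext(os.path.basename(img_path))[0]
    xs = list(range(0, max(w - tile, 0) + 1, stride))
    ys = list(range(0, max(h - tile, 0) + 1, stride))
    # Make sure the right and bottom borders are always covered.
    if xs[-1] + tile < w:
        xs.append(w - tile)
    if ys[-1] + tile < h:
        ys.append(h - tile)
    for y in ys:
        for x in xs:
            crop = img[y:y + tile, x:x + tile]
            cv2.imwrite(os.path.join(out_dir, "{}_{}_{}.png".format(name, x, y)), crop)

The annotation boxes have to be shifted into each tile's coordinate system in the same pass; DOTA_devkit handles that part as well.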
Changing anchor size and aspect_ratio
Because the targets are small, detectron2's default anchor sizes and aspect ratios need to be changed:
cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[35], [68], [87], [130], [149]]
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.1], [1.2], [1.4], [1.8], [2.7]]
Approach: compute the area and the width/height ratio of every annotation box, then run k-means clustering on each to obtain the values above. I compared sklearn's built-in KMeans with a hand-written k-means implementation found online; the clusters from sklearn's KMeans cover the data better overall and are a better fit for choosing detection anchors here. A minimal sketch of this step follows.
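A minimal sketch of the statistics-plus-clustering step, assuming the per-box widths and heights have already been collected into an (N, 2) array wh (the function and variable names are illustrative):

import numpy as np
from sklearn.cluster import KMeans

def cluster_anchor_stats(wh, n_clusters=5):
    # wh: (N, 2) array of annotation box widths and heights, collected beforehand.
    sizes = np.sqrt(wh[:, 0] * wh[:, 1]).reshape(-1, 1)  # sqrt of box area
    ratios = (wh[:, 1] / wh[:, 0]).reshape(-1, 1)        # height / width, as detectron2 expects
    size_centers = KMeans(n_clusters=n_clusters, random_state=0).fit(sizes).cluster_centers_
    ratio_centers = KMeans(n_clusters=n_clusters, random_state=0).fit(ratios).cluster_centers_
    return sorted(size_centers.ravel().tolist()), sorted(ratio_centers.ravel().tolist())

# Feeding the clustered values into the config, e.g.:
# sizes, ratios = cluster_anchor_stats(wh)
# cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[round(s)] for s in sizes]
# cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[round(r, 1)] for r in ratios]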
Adding TTA
Test-time augmentation (TTA) applies data augmentation to the test set: several augmented copies are created for each test image, the model makes a prediction on every copy, and the predictions are then merged into a single result.
cfg.TEST.AUG.ENABLED = True
cfg.TEST.AUG.MIN_SIZES = (400, 500, 512, 600, 700, 800)
cfg.TEST.AUG.MAX_SIZE = 1000
cfg.TEST.AUG.FLIP = True
Because the detector uses the five-parameter rotated-box format (x, y, w, h, θ), a few changes were made to detectron2's built-in TTA, mainly replacing apply_box with apply_rotated_box and using fast_rcnn_inference_single_image_rotated:
# Imports needed by this snippet (module paths follow detectron2's own test_time_augmentation.py).
import copy
from contextlib import contextmanager
from itertools import count

import numpy as np
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from fvcore.transforms import HFlipTransform

from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA, GeneralizedRCNN
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.rotated_fast_rcnn import fast_rcnn_inference_single_image_rotated
from detectron2.structures import Boxes, Instances

class GeneralizedRCNNWithTTA(nn.Module):
"""
A GeneralizedRCNN with test-time augmentation enabled.
Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
"""
def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
"""
Args:
cfg (CfgNode):
model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
tta_mapper (callable): takes a dataset dict and returns a list of
augmented versions of the dataset dict. Defaults to
`DatasetMapperTTA(cfg)`.
batch_size (int): batch the augmented images into this batch size for inference.
"""
super().__init__()
if isinstance(model, DistributedDataParallel):
model = model.module
assert isinstance(
model, GeneralizedRCNN
), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
self.cfg = cfg.clone()
assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
assert (
not self.cfg.MODEL.LOAD_PROPOSALS
), "TTA for pre-computed proposals is not supported yet"
self.model = model
if tta_mapper is None:
tta_mapper = DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE, cfg.TEST.AUG.FLIP)
self.tta_mapper = tta_mapper
self.batch_size = batch_size
@contextmanager
def _turn_off_roi_heads(self, attrs):
"""
Open a context where some heads in `model.roi_heads` are temporarily turned off.
Args:
attr (list[str]): the attribute in `model.roi_heads` which can be used
to turn off a specific head, e.g., "mask_on", "keypoint_on".
"""
roi_heads = self.model.roi_heads
old = {}
for attr in attrs:
try:
old[attr] = getattr(roi_heads, attr)
except AttributeError:
# The head may not be implemented in certain ROIHeads
pass
if len(old.keys()) == 0:
yield
else:
for attr in old.keys():
setattr(roi_heads, attr, False)
yield
for attr in old.keys():
setattr(roi_heads, attr, old[attr])
def _batch_inference(self, batched_inputs, detected_instances=None):
"""
Execute inference on a list of inputs,
using batch size = self.batch_size, instead of the length of the list.
Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
"""
if detected_instances is None:
detected_instances = [None] * len(batched_inputs)
outputs = []
inputs, instances = [], []
for idx, input, instance in zip(count(), batched_inputs, detected_instances):
inputs.append(input)
instances.append(instance)
if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
outputs.extend(
self.model.inference(
inputs,
instances if instances[0] is not None else None,
do_postprocess=False,
)
)
inputs, instances = [], []
return outputs
def __call__(self, batched_inputs):
"""
Same input/output format as :meth:`GeneralizedRCNN.forward`
"""
def _maybe_read_image(dataset_dict):
ret = copy.copy(dataset_dict)
if "image" not in ret:
image = read_image(ret.pop("file_name"), self.model.input_format)
image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW
ret["image"] = image
if "height" not in ret and "width" not in ret:
ret["height"] = image.shape[1]
ret["width"] = image.shape[2]
return ret
return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]
def _inference_one_image(self, input):
"""
Args:
input (dict): one dataset dict with "image" field being a CHW tensor
Returns:
dict: one output dict
"""
orig_shape = (input["height"], input["width"])
augmented_inputs, tfms = self._get_augmented_inputs(input)
# Detect boxes from all augmented versions
with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
# temporarily disable roi heads
all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
# merge all detected boxes to obtain final predictions for boxes
merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)
if self.cfg.MODEL.MASK_ON:
# Use the detected boxes to obtain masks
augmented_instances = self._rescale_detected_boxes(
augmented_inputs, merged_instances, tfms
)
# run forward on the detected boxes
outputs = self._batch_inference(augmented_inputs, augmented_instances)
# Delete now useless variables to avoid being out of memory
del augmented_inputs, augmented_instances
# average the predictions
merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
merged_instances = detector_postprocess(merged_instances, *orig_shape)
return {"instances": merged_instances}
else:
return {"instances": merged_instances}
def _get_augmented_inputs(self, input):
augmented_inputs = self.tta_mapper(input)
tfms = [x.pop("transforms") for x in augmented_inputs]
return augmented_inputs, tfms
def _get_augmented_boxes(self, augmented_inputs, tfms):
# 1: forward with all augmented images
outputs = self._batch_inference(augmented_inputs)
# 2: union the results
all_boxes = []
all_scores = []
all_classes = []
for output, tfm in zip(outputs, tfms):
# Need to inverse the transforms on boxes, to obtain results on original image
pred_boxes = output.pred_boxes.tensor
original_pred_boxes = tfm.inverse().apply_rotated_box(pred_boxes.cpu().numpy())
all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))
all_scores.extend(output.scores)
all_classes.extend(output.pred_classes)
all_boxes = torch.cat(all_boxes, dim=0)
return all_boxes, all_scores, all_classes
def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
# select from the union of all results
num_boxes = len(all_boxes)
num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
# +1 because fast_rcnn_inference expects background scores as well
all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
for idx, cls, score in zip(count(), all_classes, all_scores):
all_scores_2d[idx, cls] = score
merged_instances, _ = fast_rcnn_inference_single_image_rotated(
all_boxes,
all_scores_2d,
shape_hw,
1e-8,
self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
self.cfg.TEST.DETECTIONS_PER_IMAGE,
)
return merged_instances
def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
augmented_instances = []
for input, tfm in zip(augmented_inputs, tfms):
# Transform the target box to the augmented image's coordinate space
pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
pred_boxes = torch.from_numpy(tfm.apply_rotated_box(pred_boxes))
aug_instances = Instances(
image_size=input["image"].shape[1:3],
pred_boxes=Boxes(pred_boxes),
pred_classes=merged_instances.pred_classes,
scores=merged_instances.scores,
)
augmented_instances.append(aug_instances)
return augmented_instances
def _reduce_pred_masks(self, outputs, tfms):
# Should apply inverse transforms on masks.
# We assume only resize & flip are used. pred_masks is a scale-invariant
# representation, so we handle flip specially
for output, tfm in zip(outputs, tfms):
if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
output.pred_masks = output.pred_masks.flip(dims=[3])
all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
avg_pred_masks = torch.mean(all_pred_masks, dim=0)
return avg_pred_masks
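A minimal usage sketch for the class above, assuming cfg and a trained model already exist (the file name is just a placeholder):

tta_model = GeneralizedRCNNWithTTA(cfg, model)
tta_model.eval()
with torch.no_grad():
    outputs = tta_model([{"file_name": "test_image.png"}])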
Some hyperparameter tuning
BASE_LR is raised to 0.01, MAX_ITER to 100000, and the learning-rate decay STEPS to (50000, 75000):
cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000,75000)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = 'range'
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)
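For context, a minimal sketch of where these settings go in a standard detectron2 training script; the base config file and the dataset name are placeholders and not the rotated-box setup actually used here:

from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer

cfg = get_cfg()
# Placeholder baseline; substitute the config the project actually starts from.
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("my_dataset_train",)  # hypothetical registered dataset
cfg.DATASETS.TEST = ()

cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000, 75000)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = "range"
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()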
Incorrect / ineffective / failed attempts
Dehazing
At first I did not look at the data carefully; the images looked hazy, so I jumped straight to dehazing. Most of the "haze" is actually thick cloud, so common dehazing code brought little improvement, and it later turned out there was simply no data underneath the clouds anyway.
(Still, here are two dehazing algorithms that seemed to work reasonably well in my experiments.)
Changing anchor size
I collected the widths and heights of the annotation boxes, clustered them with k-means, and then used the square root of width times height as the anchor size. The mistake came from being unfamiliar with detectron2: its anchor generator takes sizes and aspect ratios, not raw widths and heights, but I had followed the YOLO-style anchor-clustering procedure found online (sketched below).
(Still worth linking a blog post I found helpful: YOLOV3中k-means聚类获得anchor boxes过程详解)
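For contrast with the correct approach above, a sketch of that YOLO-style clustering (k-means directly on width/height pairs, then the square root of the product as the size); wh is the same hypothetical (N, 2) array as before:

import numpy as np
from sklearn.cluster import KMeans

def yolo_style_anchor_sizes(wh, n_clusters=5):
    # Cluster (w, h) pairs directly, as in the YOLO anchor recipe.
    centers = KMeans(n_clusters=n_clusters, random_state=0).fit(wh).cluster_centers_
    # Using sqrt(w * h) of each cluster center as the anchor "size" throws away
    # the ratio information that detectron2's anchor generator expects separately.
    return sorted(np.sqrt(centers[:, 0] * centers[:, 1]).tolist())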
Data augmentation
In the end I never got data augmentation for rotated boxes working, but here is a record of the trial and error:
- Using detectron2's built-in data augmentation directly: its augmentation strategies are not fully applicable to rotated-box detection, and it cannot augment only a single class.
- Duplicating the minority-class samples offline and augmenting them with other libraries: I could not find an augmentation library that supports rotated-box detection.
- Duplicating the minority-class samples offline, augmenting them with detectron2's built-in data augmentation, and then converting the augmented data back to COCO annotations in XYWHA_ABS format for training: after augmentation it was unclear where to insert the coordinate conversion (a possible polygon-to-XYWHA conversion is sketched after this list).
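One possible place for that conversion is right after transforms.apply_coords: turn each augmented 4-point polygon back into the five-parameter format with OpenCV's minAreaRect. This is only a sketch of the idea, not code that was validated; in particular, OpenCV's angle convention differs from detectron2's XYWHA_ABS (counter-clockwise degrees), so the sign handling below is an assumption to double-check:

import numpy as np
import cv2

def polygon_to_xywha(poly):
    # poly: (4, 2) array of one augmented box's corner coordinates.
    (cx, cy), (w, h), angle = cv2.minAreaRect(poly.astype(np.float32))
    # Flip the sign so the angle is counter-clockwise, as XYWHA_ABS expects (assumption).
    return [float(cx), float(cy), float(w), float(h), -float(angle)]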
Attempts not yet completed
Adding Mosaic augmentation
YOLOv4's Mosaic augmentation builds on the CutMix idea and can be seen as an improved version of it: four images are stitched together into one new training image. The Mosaic code below is written, but for various reasons it has not been trained and validated yet.
The code is pasted below. The input annotations are JSON; the output is currently txt (it has not yet been changed to output JSON as well).
from PIL import Image, ImageDraw
import numpy as np
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
import math
import os
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
import pandas as pds
import json
import cv2
def rand(a=0, b=1):
return np.random.rand() * (b - a) + a
def merge_bboxes(bboxes, cutx, cuty):
merge_bbox = []
for i in range(len(bboxes)):
for box in bboxes[i]:
tmp_box = []
x1, y1, x2, y2, x3, y3, x4, y4 = box[0], box[1], box[2], box[3], box[4], box[5], box[6], box[7]
if i == 0:
if np.min(box[1::2]) > cuty or np.min(box[::2]) > cutx:
continue
if np.max(box[1::2]) >= cuty and np.min(box[1::2]) <= cuty:
box[1::2][np.argmax(box[1::2])] = cuty
if cuty - np.min(box[1::2]) < 5:
continue
if np.max(box[::2]) >= cutx and x1 <= cutx:
box[::2][np.argmax(box[::2])] = cutx
if cutx - x1 < 5:
continue
if i == 1:
if y2 < cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if y2 - y1 < 5:
continue
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if x2 - x1 < 5:
continue
if i == 2:
if y2 < cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if y2 - y1 < 5:
continue
if x2 >= cutx and x1 <= cutx:
x1 = cutx
if x2 - x1 < 5:
continue
if i == 3:
if y1 > cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if y2 - y1 < 5:
continue
if x2 >= cutx and x1 <= cutx:
x1 = cutx
if x2 - x1 < 5:
continue
tmp_box.append(x1)
tmp_box.append(y1)
tmp_box.append(x2)
tmp_box.append(y2)
tmp_box.append(box[-1])
merge_bbox.append(tmp_box)
return merge_bbox
def get_random_data(image_file, annotation_line, input_shape):
'''random preprocessing for real-time data augmentation'''
h, w = input_shape
box_datas = []
cls_datas = []
index = 0
place_x = [0, 0, 256, 256]
place_y = [0, 256, 0, 256]
new_image = Image.new('RGB', (w, h), (128, 128, 128))
for line in annotation_line:
# 每一行进行分割
# line_content = line.split(",")
# 打开图片
path = os.path.join(image_file, line['imagePath'])
image = utils.read_image(path, format='BGR')
r = np.random.rand(2)
augs = T.AugmentationList([
T.RandomFlip(prob=0.5),
T.RandomFlip(prob=0.5, vertical=True, horizontal=False),
T.RandomApply(T.RandomBrightness(0.9, 1.1), prob=0.3),
T.RandomApply(T.RandomSaturation(0.9, 1.1), prob=0.3),
T.RandomApply(T.RandomContrast(0.9, 1.1), prob=0.3),
T.RandomApply(T.ColorTransform(lambda x: x * r[0] + r[1] * 10), prob=0.3)
])
image, transforms = T.apply_transform_gens([augs], image)
dx = place_x[index]
dy = place_y[index]
image = image[:, :, ::-1]
new_image.paste(Image.fromarray(np.uint8(image)), (dx, dy))
# cv2.imshow('new_image', new_image)
# cv2.imshow('image', Image.fromarray(np.uint8(image)))
index += 1
iw, ih = image.shape[:2]
box = []
cls = []
for shape in line['shapes']:
bbox = []
for point in shape['points']:
bbox.append(point[0])
bbox.append(point[1])
box.append(bbox)
cls.append(shape['label'])
box = np.array(box)
# box = np.array([np.array(list(map(float, box.split()[1]))) for box in line['shapes'][0:]])
# cls = [cls.split()[-2:] for cls in line['shapes']['label']]
if box.shape[-1] == 0:
continue
box = transforms.apply_coords(box.reshape(-1, 2)).clip(min=0)
# if index == 0:
# image, transforms = T.apply_transform_gens([T.RandomCrop(crop_type='absolute', crop_size=(cuty, cutx))],
# image)
# box = transforms.apply_coords(box).clip(min=0)
# if index == 1:
# image, transforms = T.apply_transform_gens(
# [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), cutx))],
# image)
# box = transforms.apply_coords(box).clip(min=0)
# box[0, :] += cutx
# if index == 3:
# image, transforms = T.apply_transform_gens(
# [T.RandomCrop(crop_type='absolute', crop_size=(cuty, (w - cutx)))],
# image)
# box = transforms.apply_coords(box).clip(min=0)
# box[1, :] += cuty
# if index == 2:
# image, transforms = T.apply_transform_gens(
# [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), (w - cutx)))],
# image)
# box = transforms.apply_coords(box).clip(min=0)
# box[0, :] += cutx
# box[1, :] += cuty
# Shift the box coordinates by the paste offset (dx, dy) of this tile.
box[:, 0] += dx
box[:, 1] += dy
box_datas.append(box)
cls_datas.extend(cls)
if len(box_datas) == 0:
return new_image, []
box_datas = np.concatenate(box_datas, axis=0)
# vis box
box_line = box_datas.reshape(-1, 8)
# for line in box_line:
# x1, y1, x2, y2, x3, y3, x4, y4 = line
# draw = ImageDraw.Draw(new_image)
# draw.line([(x1, y1), (x2, y2)], fill='red')
# draw.line([(x2, y2), (x3, y3)], fill='red')
# draw.line([(x3, y3), (x4, y4)], fill='red')
# draw.line([(x4, y4), (x1, y1)], fill='red')
pd = pds.DataFrame(box_line)
pd2 = pds.DataFrame(cls_datas)
pd = pds.concat([pd, pd2], axis=1)
return new_image, pd
def normal_(annotation_line, input_shape):
'''random preprocessing for real-time data augmentation'''
line = annotation_line.split()
image = Image.open(line[0])
box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
iw, ih = image.size
image = image.transpose(Image.FLIP_LEFT_RIGHT)
box[:, [0, 2]] = iw - box[:, [2, 0]]
return image, box
def get_json(json_path):
info_group = []
for root, dirs, files in os.walk(json_path):
for file in files:
if file.endswith(".json"):
with open(os.path.join(root, file)) as f:
info = json.load(f)
# info = ",".join(info)
info_group.append(info)
return info_group
if __name__ == "__main__":
json_path = './train'
output_path = './train_mosaic'
json_group = get_json(json_path)
for ind in range(0, len(json_group) - 4, 4):
line = json_group[ind:ind + 4]
image_data, box_data = get_random_data(json_path, line, [512, 512])
if len(box_data) == 0:
continue
json_output_path = os.path.join(output_path, str(ind) +'.txt')
img_output_path = os.path.join(output_path, str(ind) + '.png')
js = box_data.to_json
# box_data.to_json(json_output_path)
box_data.to_csv(json_output_path, sep=' ', index=False, header=None, mode='w')
image_data.save(img_output_path)
print(ind)
print("finished")
# img = Image.fromarray((image_data * 255).astype(np.uint8))
# for j in range(len(box_data)):
# x1, y1, x2, y2, x3, y3, x4, y4 = box_data[j][0:8]
# draw = ImageDraw.Draw(img)
# draw.line([(x1, y1), (x2, y2)], fill='red')
# draw.line([(x2, y2), (x3, y3)], fill='red')
# draw.line([(x3, y3), (x4, y4)], fill='red')
# draw.line([(x4, y4), (x1, y1)], fill='red')
# # thickness = 3
# # left, top, right, bottom = box_data[j][0:4]
# # draw = ImageDraw.Draw(img)
# # for i in range(thickness):
# # draw.rectangle([left + i, top + i, right - i, bottom - i], outline=(255, 255, 255))
# img.show()
# img.save("box_all.jpg")
Replacing the angle offset with coordinate offsets
That is, moving from RoI Transformer to Gliding Vertex.
Official GitHub code: RoI Transformer, Gliding vertex
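As a rough illustration of the "angle offset to coordinate offset" idea (not the official implementation), Gliding Vertex represents an oriented box by its horizontal bounding box plus four offsets that slide each corner along one side of that horizontal box. A sketch of converting a (x, y, w, h, θ) box into that representation, with θ in degrees; the corner ordering and edge-case handling are simplified assumptions:

import numpy as np

def xywha_to_gliding_vertex(cx, cy, w, h, angle_deg):
    # Returns (xmin, ymin, xmax, ymax, a1, a2, a3, a4): the horizontal
    # bounding box plus the four gliding offsets.
    theta = np.deg2rad(angle_deg)
    c, s = np.cos(theta), np.sin(theta)
    # Corners of the rotated box around its center.
    dx = np.array([-w / 2, w / 2, w / 2, -w / 2])
    dy = np.array([-h / 2, -h / 2, h / 2, h / 2])
    xs = cx + dx * c - dy * s
    ys = cy + dx * s + dy * c
    xmin, xmax, ymin, ymax = xs.min(), xs.max(), ys.min(), ys.max()
    bw, bh = max(xmax - xmin, 1e-6), max(ymax - ymin, 1e-6)
    # One rotated-box corner lies on each side of the horizontal box;
    # the offset is its relative position along that side.
    a1 = (xs[np.argmin(ys)] - xmin) / bw   # top side
    a2 = (ys[np.argmax(xs)] - ymin) / bh   # right side
    a3 = (xmax - xs[np.argmax(ys)]) / bw   # bottom side
    a4 = (ymax - ys[np.argmin(xs)]) / bh   # left side
    return xmin, ymin, xmax, ymax, a1, a2, a3, a4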