openpose训练代码(一): http://blog.csdn.net/u011956147/article/details/79292026
openpose训练代码(二):http://blog.csdn.net/u011956147/article/details/79292734
openpose本身是很繁杂的,包含了人体姿态估计、手势估计、脸部关键点提取,还有3Dpose,是在caffe上再做的一层封装,但是如果我们实际要去用的话,很多其实都是不需要的,比如openpose里面的多线程,GUI等等,我们只需要关注一些核心的东西就好了。
在这里,我们只关心openpose中的人体关键点估计,其实在上一篇博客中,我们可以大致了解到,Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields就是CVPR 2016的CPM加上PAF,inference是很直观的,就是提取关键点,算PAF积分,再把关键点放到每个group(就是确定是不是同一个人)完成多人的姿态估计。
训练代码,其实主要就是看数据准备和数据读取,主要包括几个文件:
数据读取文件:
cpm_data_layer.cpp
cpm_data_transformer.cpp
数据准备文件:
genCOCOMask.m
genJSON.m
genLMDB.py
getANNO.m
cpm_data_layer和cpm_data_transformer都是在caffe中实现的,要理清楚这两个文件,我们需要先看一下数据准备是怎么做的,这里,也只是关注LMDB文件是怎么生成的,因为其他的都比较简单(其实生成LMDB也蛮简单的,但是作者这部分写的有点乱,需要静心好好梳理)可以自行查阅。
在genLMDB.py中,把事先处理好的数据都写入LMDB中,其中有一个函数writeLMDB,这个函数就是逐行,逐页面(这里的页面可以理解成channel,因为在读取的时候都是利用指针移动)来写入的:
def _store_bytes(meta_data, row, col, payload):
    """Copy the raw bytes of ``payload`` into ``meta_data[row]`` from column ``col`` on."""
    # Assumes ``payload`` is a byte string (the original code called ord() on
    # each element).  bytearray() yields integers on both Python 2 and 3.
    for offset, value in enumerate(bytearray(payload)):
        meta_data[row][col + offset] = value


def writeLMDB(datasets, lmdb_path, validation):
    """Serialize the prepared pose annotations and images into an LMDB.

    Every record is a Caffe ``Datum`` holding, channel by channel: the BGR
    image, one "meta data" channel whose rows carry the annotation encoded as
    raw bytes, the ``mask_miss`` channel and (COCO only) the ``mask_all``
    channel.  The row/column layout of the meta channel is what the C++ reader
    walks with pointer offsets, so the write order below must not change.

    Args:
        datasets: sequence of dataset names; only "MPI" and "COCO" are loaded,
            any other name is ignored.
        lmdb_path: directory of the LMDB.  An empty directory should be created
            beforehand (the original note estimates about 140 GB of space).
        validation: when equal to 1, samples whose ``isValidation`` is non-zero
            are skipped.

    Raises:
        IOError: if a sample image cannot be read from disk.

    Note:
        ``float2bytes`` is defined elsewhere in the script; the column offsets
        used below (4, 3, 7, 11) presume it packs each float into 4 bytes.
    """
    env = lmdb.open(lmdb_path, map_size=int(1e12))
    try:
        txn = env.begin(write=True)
        data = []
        numSample = 0

        # Annotation file of every supported dataset.
        json_paths = {
            "MPI": 'MPI.json',
            "COCO": 'dataset/COCO/json/COCO.json',
        }
        for name in datasets:
            if name not in json_paths:
                continue
            print(name)
            with open(json_paths[name]) as data_file:
                data_this = json.load(data_file)['root']
            data = data + data_this
            numSample = len(data)
            print(numSample)

        random_order = np.random.permutation(numSample).tolist()
        isValidationArray = [data[i]['isValidation'] for i in range(numSample)]
        if validation == 1:
            totalWriteCount = isValidationArray.count(0.0)
        else:
            totalWriteCount = len(data)
        print(totalWriteCount)

        writeCount = 0
        for count in range(numSample):
            # Visit the samples in shuffled order.  (The pasted code had a
            # debugging leftover, ``idx = 3``, which wrote the same sample
            # numSample times.)
            idx = random_order[count]
            sample = data[idx]
            dataset_name = sample['dataset']

            if sample['isValidation'] != 0 and validation == 1:
                print('%d/%d skipped' % (count, idx))
                continue

            if "MPI" in dataset_name:
                path_header = 'dataset/MPI/images/'
            elif "COCO" in dataset_name:
                # NOTE(review): absolute, machine-specific path kept as in the original.
                path_header = '/proj/Sunjiarui/fcm_pose_train/training/dataset/COCO/images/'
            img_path = os.path.join(path_header, sample['img_paths'])
            print(img_path)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None; fail with the
                # offending path instead of an AttributeError further down.
                raise IOError('cannot read image: %s' % img_path)

            img_idx = sample['img_paths'][-16:-3]
            # Build mask_all and mask_miss: some people are small and carry no
            # annotation although they are present, hence the masks.
            if "COCO_val" in dataset_name:
                mask_all = cv2.imread(path_header + 'mask2014/val2014_mask_all_' + img_idx + 'png', 0)
                mask_miss = cv2.imread(path_header + 'mask2014/val2014_mask_miss_' + img_idx + 'png', 0)
            elif "COCO" in dataset_name:
                mask_all = cv2.imread(path_header + 'mask2014/train2014_mask_all_' + img_idx + 'png', 0)
                mask_miss = cv2.imread(path_header + 'mask2014/train2014_mask_miss_' + img_idx + 'png', 0)
            elif "MPI" in dataset_name:
                img_idx = sample['img_paths'][-13:-3]
                mask_miss = cv2.imread('dataset/MPI/masks/mask_' + img_idx + 'jpg', 0)

            height = img.shape[0]
            width = img.shape[1]
            if width < 64:
                # Pad narrow images on the right so the meta rows have room.
                img = cv2.copyMakeBorder(img, 0, 0, 0, 64 - width,
                                         cv2.BORDER_CONSTANT, value=(128, 128, 128))
                print('saving padded image!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                cv2.imwrite('padded_img.jpg', img)
                width = 64
                # no modify on width, because we want to keep information

            # ---- meta data channel -------------------------------------
            meta_data = np.zeros(shape=(height, width, 1), dtype=np.uint8)
            clidx = 0  # current line (row) index

            # row: dataset name (string)
            for i, ch in enumerate(dataset_name):
                meta_data[clidx][i] = ord(ch)
                print('type()= %s' % (type(ord(ch)),))
            clidx = clidx + 1

            # row: image height, image width
            height_binary = float2bytes(sample['img_height'])
            _store_bytes(meta_data, clidx, 0, height_binary)
            width_binary = float2bytes(sample['img_width'])
            print('type(width_binary)= %s' % (type(width_binary),))
            _store_bytes(meta_data, clidx, 4, width_binary)
            clidx = clidx + 1

            # (a) isValidation (uint8), numOtherPeople (uint8),
            #     people_index (uint8), annolist_index (float),
            #     writeCount (float), totalWriteCount (float)
            meta_data[clidx][0] = sample['isValidation']
            meta_data[clidx][1] = sample['numOtherPeople']
            meta_data[clidx][2] = sample['people_index']
            print('type() = %s' % (type(sample['isValidation']),))
            print('data numOther =  %s' % (sample['numOtherPeople'],))
            _store_bytes(meta_data, clidx, 3, float2bytes(sample['annolist_index']))
            # note it's writeCount instead of count!
            _store_bytes(meta_data, clidx, 7, float2bytes(float(writeCount)))
            _store_bytes(meta_data, clidx, 11, float2bytes(float(totalWriteCount)))
            nop = int(sample['numOtherPeople'])
            clidx = clidx + 1

            # (b) objpos_x (float), objpos_y (float)
            _store_bytes(meta_data, clidx, 0, float2bytes(sample['objpos']))
            clidx = clidx + 1

            # (c) scale_provided (float)
            _store_bytes(meta_data, clidx, 0, float2bytes(sample['scale_provided']))
            clidx = clidx + 1

            # (d) joint_self, transposed so that each coordinate gets one row
            joints = np.asarray(sample['joint_self']).T.tolist()
            for joint_row in joints:
                _store_bytes(meta_data, clidx, 0, float2bytes(joint_row))
                clidx = clidx + 1

            # (e) check nop, prepare arrays
            print('nop= %s' % (nop,))
            if nop != 0:
                if nop == 1:
                    # A single other person is stored unwrapped in the JSON.
                    joint_other = [sample['joint_others']]
                    objpos_other = [sample['objpos_other']]
                    scale_provided_other = [sample['scale_provided_other']]
                    print('joint_other= %s' % (joint_other,))
                else:
                    joint_other = sample['joint_others']
                    objpos_other = sample['objpos_other']
                    scale_provided_other = sample['scale_provided_other']
                    print('joint_others2 = %s' % (joint_other,))

                # (f) objpos_other_x (float), objpos_other_y (float) (nop lines)
                for i in range(nop):
                    _store_bytes(meta_data, clidx, 0, float2bytes(objpos_other[i]))
                    clidx = clidx + 1

                # (g) scale_provided_other (nop floats in 1 line)
                _store_bytes(meta_data, clidx, 0, float2bytes(scale_provided_other))
                clidx = clidx + 1

                # (h) joint_others, transposed like joint_self (one block per person)
                for n in range(nop):
                    joints = np.asarray(joint_other[n]).T.tolist()
                    print('joints= %s' % (joints,))
                    print('joint_other[n]= %s' % (joint_other[n],))
                    for joint_row in joints:
                        _store_bytes(meta_data, clidx, 0, float2bytes(joint_row))
                        clidx = clidx + 1

            # The original note says "total 7+4*nop lines".
            # NOTE(review): counting the rows written above gives a larger
            # number; confirm against the C++ reader before relying on it.
            # The channel order matters: the C++ side reads it back through
            # pointer offsets.
            if "COCO" in dataset_name:
                img4ch = np.concatenate(
                    (img, meta_data, mask_miss[..., None], mask_all[..., None]), axis=2)
            elif "MPI" in dataset_name:
                img4ch = np.concatenate((img, meta_data, mask_miss[..., None]), axis=2)
            img4ch = np.transpose(img4ch, (2, 0, 1))  # HWC -> CHW
            print(img4ch.shape)

            datum = caffe.io.array_to_datum(img4ch, label=0)
            key = '%07d' % writeCount
            # .encode() is a no-op on Python 2 and yields the bytes key that
            # lmdb requires on Python 3.
            txn.put(key.encode('ascii'), datum.SerializeToString())
            if writeCount % 1000 == 0:
                # Commit periodically to keep the write transaction small.
                txn.commit()
                txn = env.begin(write=True)
            print('%d/%d/%d/%d' % (count, writeCount, idx, numSample))
            writeCount = writeCount + 1

        txn.commit()
    finally:
        env.close()
在上述Python代码过后,就会生成训练所需要的LMDB文件,在实际的使用过程中,需要重新写caffe的data_layer,关于caffe的data_layer ,可以参考我之前的一篇博客: http://mp.blog.csdn.net/mdeditor/77987504
下面是cpm_data_layer和cpm_data_transformer,其实cpm_data_layer主要就是layer的建立,主要的数据转化都是在cpm_data_transformer中完成的。
先看cpm_data_layer的setup函数(代码有些细微地方我可能改过):
// Layer set-up: builds the CPM data transformer, peeks at one datum to learn
// the input geometry, and sizes the top blobs, the prefetch buffers and the
// per-item "transformed" scratch blobs accordingly.
template <typename Dtype>
void CPMDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  cpm_data_transformer_.reset(
      new CPMDataTransformer<Dtype>(cpm_transform_param_, this->phase_));
  cpm_data_transformer_->InitRand();

  // Peek at (do not consume) one datum; it is used to initialise the shapes.
  Datum& datum = *(reader_.full().peek());
  LOG(INFO) << datum.height() << " " << datum.width() << " " << datum.channels();

  const bool force_color = this->layer_param_.data_param().force_encoded_color();
  const bool decoded =
      (force_color && DecodeDatum(&datum, true)) || DecodeDatumNative(&datum);
  if (decoded) {
    LOG(INFO) << "Decoding Datum";
  }

  const int crop_size = this->layer_param_.cpm_transform_param().crop_size();
  const int batch_size = this->layer_param_.data_param().batch_size();

  // Outside of TRAIN the datum's own size is used; in TRAIN the configured
  // crop size is used.
  const bool training = (this->phase_ == TRAIN);
  const int height = training ?
      this->layer_param_.cpm_transform_param().crop_size_y() : datum.height();
  const int width = training ?
      this->layer_param_.cpm_transform_param().crop_size_x() : datum.width();

  // ---- image blobs ----
  // When crop_size > 0 nothing is reshaped here (the square-crop reshape code
  // of the original is commented out).
  if (crop_size <= 0) {
    LOG(INFO) << "PREFETCH_COUNT is " << this->PREFETCH_COUNT;
    const int channels = datum.channels();
    top[0]->Reshape(batch_size, channels, height, width);
    for (int buf = 0; buf < this->PREFETCH_COUNT; ++buf) {
      // e.g. 10,6,368,368 in the author's configuration
      this->prefetch_[buf].data_.Reshape(batch_size, channels, height, width);
    }
    // single-item view, e.g. 1,6,368,368
    this->transformed_data_.Reshape(1, channels, height, width);
  }
  LOG(INFO) << "output data size: " << top[0]->num() << ","
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();

  // ---- label blobs ----
  if (!this->output_labels_) {
    return;
  }
  // The label map is the input size divided by the stride (e.g. 8).
  const int stride = this->layer_param_.cpm_transform_param().stride();
  int num_parts = this->layer_param_.cpm_transform_param().num_parts();
  const int label_channels = 2 * (num_parts + 1);
  const int label_height = height / stride;
  const int label_width = width / stride;

  top[1]->Reshape(batch_size, label_channels, label_height, label_width);
  for (int buf = 0; buf < this->PREFETCH_COUNT; ++buf) {
    // e.g. 10,114,46,46 in the author's configuration
    this->prefetch_[buf].label_.Reshape(batch_size, label_channels,
        label_height, label_width);
  }
  // single-item view, e.g. 1,114,46,46
  this->transformed_label_.Reshape(1, label_channels, label_height, label_width);
}
在这个函数中,主要就是一些超参数的读取,和数据输出格式的规定。
关键的是load_batch 函数,我截取了一部分:
// Excerpt from load_batch: transforms one datum and writes the result directly
// into the batch buffers at the slot that belongs to item_id.
// Apply data transformations (mirror, scale, crop...)
timer.Start();
// Offsets of this item inside the batch's data and label blobs.
const int offset_data = batch->data_.offset(item_id);
const int offset_label = batch->label_.offset(item_id);
// Point the single-item scratch blobs at the batch memory, so whatever the
// transformer writes into them lands in the batch without an extra copy.
this->transformed_data_.set_cpu_data(top_data + offset_data);
this->transformed_label_.set_cpu_data(top_label + offset_label);
if (datum.encoded()) {
// Encoded datum: only the image data is transformed (no label blob is passed).
this->cpm_data_transformer_->Transform(cv_img, &(this->transformed_data_));
} else {
// Raw datum: Transform_nv receives both the data and the label blob, plus
// the running counter cnt, which is incremented after every call.
this->cpm_data_transformer_->Transform_nv(datum,
&(this->transformed_data_),
&(this->transformed_label_), cnt);
++cnt;
}
// The scalar datum label is not copied (left disabled in the original):
// if (this->output_labels_) {
// top_label[item_id] = datum.label();
// }
// Accumulate the time spent in the transformation.
trans_time += timer.MicroSeconds();
这里调用Transform和Transform_nv 就进入了cpm_data_transformer文件。
下一篇写cpm_data_transformer。
原文链接:http://blog.csdn.net/u011956147/article/details/79292026