bn层又坑我一次!!!!
caffe转pytorch。 由于第二次加了一些网络,不知道从哪里复制的,直接是
self.p6_conv_bn = nn.BatchNorm2d(128)
然后跑前向对精度的时候死活不一样啊!!!!
然后开始了我查找问题的漫漫之旅!!足足花了我2h。
首先就是对各种层输出看哪里不一样,比如就是conv3之前都一样,但是然后conv4不一样了!
conv3输出的featuremap一样然后经过conv4就不一样了。 这里包括conv、bn、relu
那为什么之前一样的?
是不是因为权重不一样导致的,然后又去核对转权重脚本,没发现毛病。
没办法,然后想着对权重,caffe层的权重在哪里看呢?
比如卷积层,关键的是通过
if("conv_4" == this->name())
来确定是我们需要关注的层。然后可以输出这层的featuremap的输出;
权重可以通过vector<shared_ptr<Blob>> blob_learn = this->blobs();来输出!
/// Forward pass of the convolution layer on the GPU.
///
/// For every bottom/top blob pair, each of the `num_` items is run through
/// forward_gpu_gemm with the layer weights (blobs_[0]); when `bias_term_` is
/// set, the bias (blobs_[1]) is then applied with forward_gpu_bias.
/// Quantize_gpu is invoked once before and once after the computation.
///
/// @param bottom blobs read as input
/// @param top    blobs whose data is written
template <typename Ftype, typename Btype>
void ConvolutionLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom,
                                                 const vector<Blob*>& top) {
  this->Quantize_gpu(bottom, top);

  const Ftype* weight = this->blobs_[0]->template gpu_data<Ftype>();
  for (int blob_idx = 0; blob_idx < bottom.size(); ++blob_idx) {
    const Ftype* bottom_data = bottom[blob_idx]->gpu_data<Ftype>();
    Ftype* top_data = top[blob_idx]->mutable_gpu_data<Ftype>();

    for (int item = 0; item < this->num_; ++item) {
      // Each item occupies bottom_dim_ / top_dim_ elements in its blob.
      const Ftype* item_in = bottom_data + item * this->bottom_dim_;
      Ftype* item_out = top_data + item * this->top_dim_;

      this->forward_gpu_gemm(item_in, weight, item_out);
      if (!this->bias_term_) {
        continue;
      }
      const Ftype* bias = this->blobs_[1]->template gpu_data<Ftype>();
      this->forward_gpu_bias(item_out, bias);
    }
  }

  this->Quantize_gpu(bottom, top);

  // Debug hook (disabled): for the layer named "conv_4", dump bottom[0] and
  // the learned weight blob to text files so they can be inspected and
  // compared with PyTorch.
  // How to save blob values to txt:
  // https://www.cnblogs.com/yanghailin/p/17028147.html
  //
  // if ("conv_4" == this->name()) {
  //   string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
  //   bottom[0]->save_data_to_txt(path_conv_depth);
  //
  //   vector<shared_ptr<Blob>> blob_learn = this->blobs();
  //   string shape_1 = blob_learn[0]->shape_string();
  //   blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4_caffe.txt");
  //   int a=0;
  // }
}
同样的bn层BatchNormLayer
// Excerpt of BatchNormLayer::Forward_gpu showing a debug hook that is entered
// only for the layer named "conv_4/bn". The bare `...` lines are kept from the
// original excerpt (presumably they stand in for the omitted function body).
template<typename Ftype, typename Btype>
void BatchNormLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom, const vector<Blob*>& top) {
...
...
// NOTE(review): line breaks appear to have been lost in the `//` lines below.
// The `vector<shared_ptr<Blob>> blob_learn = this->blobs();` / `shape_0..4`
// statements and the trailing `int a=0;}` now sit behind `//`, so as written
// the `if` opened here is never closed. Presumably those statements were live
// (reading the shapes of the layer's five blobs under a debugger, with
// `int a=0;` as a breakpoint anchor) -- confirm against the original source.
...if("conv_4/bn" == this->name()){
// string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
// bottom[0]->save_data_to_txt(path_conv_depth);vector<shared_ptr<Blob>> blob_learn = this->blobs();string shape_0 = blob_learn[0]->shape_string();string shape_1 = blob_learn[1]->shape_string();string shape_2 = blob_learn[2]->shape_string();string shape_3 = blob_learn[3]->shape_string();string shape_4 = blob_learn[4]->shape_string();
// blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4.txt");int a=0;}
至于pytorch,我是打断点看权重的。
caffe里面的bn层如下写:
layer {
  name: "conv4_1/bn"
  type: "BatchNorm"
  bottom: "conv4_1"
  top: "conv4_1"
  batch_norm_param {
    moving_average_fraction: 0.995
    eps: 0.0001
    scale_bias: true
  }
}
caffe权重提取到本地pkl文件。
def net_prediction(net, save_weight_pkl_path="./weights202212.pkl"):
    """Export every learnable parameter of a Caffe net to a pickle file.

    Iterates over ``net.params`` and stores each layer's blobs in a nested dict
    ``{layer_name: {param_name: ndarray}}``, choosing the param names from the
    number of blobs the layer owns:

    * 1 blob  -> ``weight`` (per the original notes: a deconvolution layer)
    * 2 blobs -> ``weight`` and ``bias`` (per the original notes: convolution
      or fully connected layer)
    * 5 blobs -> BatchNorm: ``running_mean`` (blob 0), ``running_var``
      (blob 1), ``weight`` (blob 3) and ``bias`` (blob 4); blob 2 is not
      exported.

    A summary of layer names and shapes is printed and also written to
    ``keys.txt`` in the current working directory.

    Args:
        net: object exposing ``params``, a mapping from layer name to a
            sequence of blobs that each have a ``.data`` array.
        save_weight_pkl_path: path of the pickle file to write
            (pickle protocol 2).

    Raises:
        RuntimeError: if a layer has a blob count other than 1, 2 or 5.
    """
    # Original note on the signature line: img0 [128,384,3]
    #
    # Optional debugging aid (disabled): print feature-map and weight shapes.
    # print("=================feature map===================")
    # for layer_name, blob in net.blobs.iteritems():
    #     print(layer_name + '\t' + str(blob.data.shape))
    # print("=================weights===================")
    # for layer_name, blob in net.params.iteritems():
    #     len_ = len(blob)
    #     print(layer_name + " has " + str(len_) + " params")
    #     for i in range(len_):
    #         print(layer_name + ' idx= ' + str(i) + '\t' + str(blob[i].data.shape))

    name_weights = {}

    # `with` guarantees keys.txt is closed even when an unsupported layer
    # raises below (the original leaked the handle in that case).
    with open('keys.txt', 'w') as keys:
        keys.write('generated by VPDNet-Caffe/convert_to_pkl.py\n\n')

        for param_name in net.params.keys():
            name_weights[param_name] = {}
            layer_params = net.params[param_name]
            blob_count = len(layer_params)

            if blob_count == 1:
                # A single blob: weight only.
                weight = layer_params[0].data
                name_weights[param_name]['weight'] = weight
                print('%s:\n\t%s (weight)' % (param_name, weight.shape))
                keys.write('%s:\n\t%s (weight)\n' % (param_name, weight.shape))

            elif blob_count == 2:
                # Two blobs: weight followed by bias.
                weight = layer_params[0].data
                bias = layer_params[1].data
                name_weights[param_name]['weight'] = weight
                name_weights[param_name]['bias'] = bias
                print('%s:\n\t%s (weight)' % (param_name, weight.shape))
                print('\t%s (bias)' % str(bias.shape))
                keys.write('%s:\n\t%s (weight)\n' % (param_name, weight.shape))
                keys.write('\t%s (bias)\n' % str(bias.shape))

            elif blob_count == 5:
                # Five blobs: BatchNorm with learned scale and bias.
                # Blob 2 is deliberately NOT used as a divisor: the original
                # notes record it as 0 / 0.995 and fix the divisor to 1.
                # Alternative kept for reference:
                #   running_mean / layer_params[2].data
                #   running_var / layer_params[2].data
                bn_divisor = 1
                running_mean = layer_params[0].data
                running_var = layer_params[1].data
                name_weights[param_name]['running_mean'] = running_mean / bn_divisor
                name_weights[param_name]['running_var'] = running_var / bn_divisor
                name_weights[param_name]['weight'] = layer_params[3].data
                name_weights[param_name]['bias'] = layer_params[4].data
                print('%s:\n\t%s (running_var)' % (param_name, running_var.shape))
                print('\t%s (running_mean)' % str(running_mean.shape))
                keys.write('%s:\n\t%s (running_var)\n' % (param_name, running_var.shape))
                keys.write('\t%s (running_mean)\n' % str(running_mean.shape))
                keys.write('\t%s (weight)\n' % str(layer_params[3].data.shape))
                keys.write('\t%s (bias)\n' % str(layer_params[4].data.shape))

            else:
                # Unknown layout: fail loudly and say which layer it was so the
                # caller can extend the branches above.
                raise RuntimeError(
                    "还有参数个数超过3个的层,别漏了兄dei!!!\n"
                    "layer %r has %d params" % (param_name, blob_count))

    # Persist the collected weights.
    with open(save_weight_pkl_path, 'wb') as f:
        pkl.dump(name_weights, f, protocol=2)
这个值moving_average_fraction: 0.995,保存在caffe bn的第三个参数,但是实际跑推理的时候好像是直接读取prototxt里面的值,但是又好像直接就是1? 因为有遇到caffemodel里面bn层第三个参数是0
pytorch按照下面加载:
# Build the PyTorch state dict entries for the vd_conv11_1 convolution and its
# BatchNorm layer from the exported Caffe weights in name_weights, converting
# each array with from_numpy.
state_dict_vd = {
    'vd_conv11_1.weight': from_numpy(name_weights['vd_conv11_1']['weight']),
    'vd_conv11_1.bias': from_numpy(name_weights['vd_conv11_1']['bias']),
    'vd_conv11_1_bn.running_var': from_numpy(name_weights['vd_conv11_1_bn']['running_var']),
    'vd_conv11_1_bn.running_mean': from_numpy(name_weights['vd_conv11_1_bn']['running_mean']),
    'vd_conv11_1_bn.weight': from_numpy(name_weights['vd_conv11_1_bn']['weight']),
    'vd_conv11_1_bn.bias': from_numpy(name_weights['vd_conv11_1_bn']['bias']),
}
说了这么多,但是pytorch的参数要和caffe对齐啊!!!
self.conv_4_bn = nn.BatchNorm2d(128, eps=0.0001)
一开始直接是self.conv_4_bn = nn.BatchNorm2d(128)(nn.BatchNorm2d默认eps=1e-5,而caffe的prototxt里是eps: 0.0001),导致精度不一样,花费了我好久才找到问题!!!