RoIAlign/RoIPooling CUDA源码解读

程序来自mmdetection的源码，相比原版的RoIAlign实现有所改动，但是效果一样。此外修复了mmdetection中的部分冗余代码。

mmdetection代码库注释：

本文的注释代码上传至（包括RoIPooling）：

CUDA的部分

这里定义了宏函数CUDA_1D_KERNEL_LOOP(i, n)，表示当任务总数n大于当前grid开辟的线程总数时，每个线程以总线程数为步长循环处理多个任务，直到全部任务完成。后面使用时会传入具体参数进行展开；
当前开辟的所有线程数是blockDim.x * gridDim.x ；
当需要并行的任务总数超过了当前开辟的所有线程数时，可以让线程循环地完成任务，这是一种常见的用法；
比如，一共开辟了5*2共十个线程，一共有30个任务，0号线程在干完任务0后，可以继续干任务0+10，之后可以继续干任务0+10+10；
同理1号线程可以按顺序去做任务1,11,21。

// Loop header for a 1D kernel: the calling thread starts at its global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// launched threads (blockDim.x * gridDim.x) until it reaches n. This lets a
// launch with fewer than n threads still cover all n work items.
//   i - name of the int loop variable declared by the macro
//   n - exclusive upper bound; parenthesized in the expansion so that an
//       expression argument (e.g. `a & b`, `c ? x : y`) is compared as a whole.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

每个block开辟的线程数1024；

// Number of threads per block (1024).
// NOTE(review): the kernel launch site is not part of this snippet; presumably
// this is used as the block dimension when launching the kernels — confirm there.
#define THREADS_PER_BLOCK 1024

双线性插值

/*
 * Bilinearly interpolates one feature-map plane at the continuous location
 * (y, x).
 *
 * Template parameter:
 *   scalar_t    - element type of the feature map. It is a template type
 *                 parameter supplied at instantiation (not a macro).
 *
 * Parameters:
 *   bottom_data - pointer to the first element of the plane, read as a flat
 *                 row-major array: bottom_data[row * width + col].
 *   height      - number of rows in the plane.
 *   width       - number of columns in the plane.
 *   y, x        - row / column coordinate of the sampling point.
 *
 * Returns:
 *   0 when the point lies outside [-1, height] x [-1, width]; otherwise the
 *   weighted sum of the four neighbouring elements. Coordinates below 0 are
 *   clamped to 0, and coordinates whose integer part reaches the last
 *   row/column are snapped onto that row/column.
 *
 * Declared __host__ __device__ so the same code can also be called on the CPU
 * (e.g. for a reference check); device callers are unaffected.
 */
template <typename scalar_t>
__host__ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
                                                  const int height,
                                                  const int width, scalar_t y,
                                                  scalar_t x) {
  // Sampling points too far outside the feature map contribute nothing.
  if (y < -1 || y > height || x < -1 || x > width) {
    return 0;
  }

  // Points slightly before the first row/column are pulled onto it.
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  // Keep the "high" neighbour inside the plane: on the last row/column both
  // neighbours collapse onto it and the fractional offset becomes 0.
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional offsets of the sample from the low row/column, and their
  // complements. The constant 1 is written in scalar_t so that a float
  // instantiation does not compute in double.
  const scalar_t one = static_cast<scalar_t>(1);
  scalar_t ly = y - y_low;  // offset from row y_low
  scalar_t lx = x - x_low;  // offset from column x_low
  scalar_t hy = one - ly;   // remaining distance to row y_high
  scalar_t hx = one - lx;   // remaining distance to column x_high

  // The four neighbours: (low row, low col), (low row, high col),
  // (high row, low col), (high row, high col).
  scalar_t lt = bottom_data[y_low * width + x_low];
  scalar_t rt = bottom_data[y_low * width + x_high];
  scalar_t lb = bottom_data[y_high * width + x_low];
  scalar_t rb = bottom_data[y_high * width + x_high];

  // w1..w4 are the bilinear weights matching lt, rt, lb, rb respectively.
  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);

  return val;
}

前向传播

/*
 * RoIAlign forward kernel: every output element is the mean of bilinearly
 * interpolated samples taken inside one bin of one ROI.
 *
 * Launch layout: 1D grid of 1D blocks. Work items are distributed through
 * CUDA_1D_KERNEL_LOOP, so the number of launched threads may be smaller than
 * nthreads. No shared memory is used.
 *
 * Parameters:
 *   nthreads       - number of output elements to compute; each flat index is
 *                    decoded as (n, c, ph, pw) below.
 *   bottom_data    - input feature maps, addressed as
 *                    ((batch * channels + c) * height + row) * width + col.
 *   bottom_rois    - ROI list, 5 values per ROI: [batch_index, x1, y1, x2, y2].
 *                    The coordinates are multiplied by spatial_scale to move
 *                    them into feature-map units.
 *   spatial_scale  - scale factor applied to the ROI coordinates.
 *   sample_num     - samples per bin along each axis; when <= 0 the count is
 *                    ceil(bin size) along that axis instead.
 *   channels       - number of channels of the feature maps.
 *   height, width  - spatial size of the feature maps.
 *   pooled_height, pooled_width - output size of each ROI.
 *   top_data       - output buffer, written at top_data[index].
 */
template <typename scalar_t>
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
                                const scalar_t *bottom_rois,
                                const scalar_t spatial_scale,
                                const int sample_num, const int channels,
                                const int height, const int width,
                                const int pooled_height, const int pooled_width,
                                scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is the element of the aligned output handled here.
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // Five consecutive values describe ROI n: batch index, x1, y1, x2, y2.
    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
    const int roi_batch_ind = offset_bottom_rois[0];

    // Scale the ROI into feature-map coordinates; the values stay fractional
    // (no rounding). The end coordinates get +1 before scaling.
    const scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
    const scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
    const scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
    const scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;

    // Malformed ROIs (end before start) are clamped to zero extent. The clamp
    // is done in scalar_t so a double instantiation is not narrowed to float.
    const scalar_t raw_roi_width = roi_end_w - roi_start_w;
    const scalar_t raw_roi_height = roi_end_h - roi_start_h;
    const scalar_t roi_width =
        raw_roi_width > 0 ? raw_roi_width : static_cast<scalar_t>(0);
    const scalar_t roi_height =
        raw_roi_height > 0 ? raw_roi_height : static_cast<scalar_t>(0);

    // Size of one output bin in feature-map units.
    const scalar_t bin_size_h = roi_height / pooled_height;
    const scalar_t bin_size_w = roi_width / pooled_width;

    // Start of the (batch, channel) plane this output element reads from.
    const scalar_t *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;

    // Samples per bin along each axis: the fixed sample_num when positive,
    // otherwise the bin size rounded up.
    const int sample_num_h =
        (sample_num > 0) ? sample_num : static_cast<int>(ceil(bin_size_h));
    const int sample_num_w =
        (sample_num > 0) ? sample_num : static_cast<int>(ceil(bin_size_w));

    scalar_t output_val = 0;
    for (int iy = 0; iy < sample_num_h; iy++) {
      // ROI origin + bin offset + centre of the iy-th sub-cell inside the bin.
      const scalar_t y = roi_start_h + ph * bin_size_h +
                         (scalar_t)(iy + scalar_t(.5f)) * bin_size_h /
                             (scalar_t)(sample_num_h);
      for (int ix = 0; ix < sample_num_w; ix++) {
        const scalar_t x = roi_start_w + pw * bin_size_w +
                           (scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
                               (scalar_t)(sample_num_w);
        output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
                                                     width, y, x);
      }
    }

    // Average over the samples. A zero-extent ROI with sample_num <= 0 yields
    // no samples at all; dividing by at least 1 makes that case produce 0
    // instead of 0/0 (NaN).
    const int sample_count = sample_num_h * sample_num_w;
    output_val /= (sample_count > 0 ? sample_count : 1);

    top_data[index] = output_val;
  }
}

反向传播

原理类似不赘述，见github的注释。

发布于 2019-07-25 16:55

CUDA

ROI

RoIAlign/RoIPooling CUDA源码解读

CUDA的部分

双线性插值

前向传播

反向传播

文章被以下专栏收录

计算机视觉小记