GEMM of Caffe

void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input, const Dtype* weights, Dtype* output, bool skip_im2col) {
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    if (!skip_im2col) {
      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
    col_buff = col_buffer_.gpu_data();
  for (int g = 0; g < group_; ++g) {
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
        group_, conv_out_spatial_dim_, kernel_dim_ / group_,
        (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
        (Dtype)0., output + output_offset_ * g);

Firstly, expand input feature maps by conv_im2col_gpu to matrix B. The format is similar to here, but each column stores a cube of input features convoluted by any filter (kernel/weights), so totally output_width x output_height columns.

Secondly, caffe_gpu_gemm multiplies B by weight matrix (A) , getting output feature maps C, that’s C=A*B. Each row of C is a output feature map, and in weight matrix A, each row is a filter (kernel).

Forwarding for different groups and batches are handled by loop. (Slow speed)

Function explanation:

Remark: ALL data are stored as 1D array. 
Remark: 2D, 3D and 4D can only be imagined by your brain
Function: Input feature maps are expanded to a 2D matrix, imagining expanded data as a 3D array (BLOB_X) makes it easier to understand. 
For this 3D array (noted as 'BLOB_X'): 
  The first and second dimension are the width and height of output feature maps;
  The third dimension is the number of parameters in a 3D kernel
  One executing instance of im2col_gpu_kernel deals with input feature slice connected with a 2D kernel slice (SLICE_Y)
template <typename Dtype>
__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
    const int height, const int width, //dimemsion of input feature maps
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,//dimension of output feature maps
    Dtype* data_col) {
  CUDA_KERNEL_LOOP(index, n) {//index of a SLICE_Y
    int w_out = index % width_col; //offset of the 1st dimension of BLOB_X
    int h_index = index / width_col;
    int h_out = h_index % height_col;//offset of the 2nd dimension of BLOB_X
    int channel_in = h_index / height_col;//offset of the 3rd dimen of SLICE_Y
    int channel_out = channel_in * kernel_h * kernel_w;//offset of the 3rd dimension of BLOB_X
    int h_in = h_out * stride_h - pad_h;//offset of the 1st dimen of SLICE_Y
    int w_in = w_out * stride_w - pad_w;//offset of the 2nd dimen of SLICE_Y
    Dtype* data_col_ptr = data_col;
    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
    const Dtype* data_im_ptr = data_im;
    data_im_ptr += (channel_in * height + h_in) * width + w_in;
    //store values connected with SLICE_Y
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
            data_im_ptr[i * width + j] : 0;
        data_col_ptr += height_col * width_col;//increment 3rd dimen of BLOB_X
As cublas follows fortran order(colunm first order), this function adapts it to C/C++ (row first order) 
template <>
void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  //C=AB <==> C'=B'A'
  //As fortran order and C/C++ order are transposingly related, transposing is done intrinsically
  CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));