依旧先来我自己实现的版本,首先我查阅了一下Bgr2Gray的原理:

一、核心原理

对于一个像素:

  • B:蓝色分量
  • G:绿色分量
  • R:红色分量

灰度值 Gray 通常按下面公式计算:
𝐺𝑟𝑎𝑦=0.114×𝐵+0.587×𝐺+0.299×𝑅
如果取BGR各个分量的平均值也能得到一个灰度值,但效果不够符合人眼视觉。因为人眼对不同颜色敏感程度不同:绿色最敏感,红色次之,蓝色最不敏感;所以灰度转换时,绿色权重最大,蓝色最小。

二、第一次实现

static void BM_bgr2gary(benchmark::State &state)
{
    auto bgr = imread("img.png");
    CV_Assert(!bgr.empty());
    CV_Assert(bgr.type() == CV_8UC3);
    auto col = bgr.cols;
    auto row = bgr.rows;

    const auto src_data = bgr.data;

    for (auto _ : state)
    {
        Mat gray(row, col, CV_8UC1);
        const auto gray_data = gray.data;
        for (int i = 0; i < row; ++i)
        {

            auto *bgr_row_data = src_data + (i * bgr.step);
            auto *gray_row_data = gray_data + (i * gray.step);
            for (int j = 0; j < col; ++j)
            {
                gray_row_data[j] = (29 * bgr_row_data[3 * j] + 150 * bgr_row_data[3 * j + 1] + 77 * bgr_row_data[3 * j + 2]) / 256;
            }
        }
        benchmark::DoNotOptimize(gray.data);
        benchmark::ClobberMemory();
    }
}

static void BM_cvbgr2gary(benchmark::State &state)
{
    auto bgr = imread("img.png");
    CV_Assert(!bgr.empty());
    CV_Assert(bgr.type() == CV_8UC3);
    auto col = bgr.cols;
    auto row = bgr.rows;

    for (auto _ : state)
    {
        Mat gray(row, col, CV_8UC1);
        cvtColor(bgr, gray, COLOR_BGR2GRAY);
        benchmark::DoNotOptimize(gray.data);
        benchmark::ClobberMemory();
    }
}

Benchmark Time CPU Iterations

BM_bgr2gary 275831 ns 269859 ns 2496
BM_cvbgr2gary 55085 ns 53694 ns 12049

感觉效果还行?依旧开始读代码吧

三、[cvtColor](opencv/modules/imgproc/src/color_yuv.simd.hpp at 4.x · opencv/opencv)源码阅读

//cv源码
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();

    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
}

template<>
struct RGB2Gray<uchar>
{
    typedef uchar channel_type;

    static const int BY = BY15;
    static const int GY = GY15;
    static const int RY = RY15;
    static const int shift = gray_shift;

    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
    {
        const int coeffs0[] = { RY, GY, BY };
        for(int i = 0; i < 3; i++)
                coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int scn = srccn;
        short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
        int i = 0;

#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_uint8>::vlanes();
        v_int16 bg2y;
        v_int16 r12y;
        v_int16 dummy;
        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
        v_int16 delta = vx_setall_s16(1 << (shift-1));

        for( ; i <= n-vsize;
             i += vsize, src += scn*vsize, dst += vsize)
        {
            v_uint8 r, g, b, a;
            if(scn == 3)
            {
                v_load_deinterleave(src, b, g, r);
            }
            else
            {
                v_load_deinterleave(src, b, g, r, a);
            }

            //TODO: shorten registers use when v_deinterleave is available

            v_uint16 r0, r1, g0, g1, b0, b1;
            v_expand(r, r0, r1);
            v_expand(g, g0, g1);
            v_expand(b, b0, b1);

            v_int16 bg00, bg01, bg10, bg11;
            v_int16 rd00, rd01, rd10, rd11;
            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);

            v_uint32 y00, y01, y10, y11;
            y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
            y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
            y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
            y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));

            v_uint16 y0, y1;
            y0 = v_pack(y00, y01);
            y1 = v_pack(y10, y11);

            v_uint8 y = v_pack(y0, y1);
            v_store(dst, y);
        }
        vx_cleanup();
#endif

        for( ; i < n; i++, src += scn, dst++)
        {
            int b = src[0], g = src[1], r = src[2];
            uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
            dst[0] = y;
        }
    }

    int srccn;
    short coeffs[3];
};
  1. 外层cvtBGRtoGray的作用:
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)

第一,判断输入图像的数据类型

  • CV_8U :uchar
  • CV_16U :ushort
  • float
  1. 按照输入类型进行CvtColorLoop

内部的

static const int BY = BY15;
static const int GY = GY15;
static const int RY = RY15;
static const int shift = gray_shift;

本质就是 Gray=0.114B+0.587G+0.299R 但是OpenCV为了效率将它换为整数计算,再右移还原

  1. 沟槽构造函数
RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
{
    const int coeffs0[] = { RY, GY, BY };
    for(int i = 0; i < 3; i++)
            coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
    if(blueIdx == 0)
        std::swap(coeffs[0], coeffs[2]);

    CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
}

默认顺序先按 R, G, B 放,如果 blueIdx == 0,说明当前输入是 BGR,那么要交换首尾

  1. 核心计算
for( ; i < n; i++, src += scn, dst++)
{
    int b = src[0], g = src[1], r = src[2];
    uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
    dst[0] = y;
}
  • 先做整数加权和

  • 再除以 2^shift

  • 顺便做四舍五入

OpenCV 这里用的是 15位缩放版本,比我的八位缩放精度更高

  1. OpenCV比我快的地方
#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_uint8>::vlanes();
        v_int16 bg2y;
        v_int16 r12y;
        v_int16 dummy;
        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
        v_int16 delta = vx_setall_s16(1 << (shift-1));

        for( ; i <= n-vsize;
             i += vsize, src += scn*vsize, dst += vsize)
        {
            v_uint8 r, g, b, a;
            if(scn == 3)
            {
                v_load_deinterleave(src, b, g, r);
            }
            else
            {
                v_load_deinterleave(src, b, g, r, a);
            }

            //TODO: shorten registers use when v_deinterleave is available

            v_uint16 r0, r1, g0, g1, b0, b1;
            v_expand(r, r0, r1);
            v_expand(g, g0, g1);
            v_expand(b, b0, b1);

            v_int16 bg00, bg01, bg10, bg11;
            v_int16 rd00, rd01, rd10, rd11;
            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);

            v_uint32 y00, y01, y10, y11;
            y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
            y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
            y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
            y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));

            v_uint16 y0, y1;
            y0 = v_pack(y00, y01);
            y1 = v_pack(y10, y11);

            v_uint8 y = v_pack(y0, y1);
            v_store(dst, y);
        }
        vx_cleanup();
#endif

意思不是一个像素一个像素算,而是:一次加载很多个 BGR 像素,拆成 b/g/r 三组向量,并行做乘法、加法、右移,最后一次性得到一批灰度值

所以执行顺序可以理解为:

  1. 能用 SIMD 的部分,先批量算
  2. 剩下不足一批的尾巴,再走下面这个普通循环
for( ; i < n; i++, src += scn, dst++)
Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐