OpenCV源码阅读二:cvtColor阅读

hu443556637

466人浏览 · 2026-03-12 21:57:04

hu443556637 · 2026-03-12 21:57:04 发布

依旧先来我自己实现的版本,首先我查阅了一下Bgr2Gray的原理:

一、核心原理

对于一个像素：

B：蓝色分量
G：绿色分量
R：红色分量

灰度值 Gray 通常按下面公式计算：
𝐺𝑟𝑎𝑦=0.114×𝐵+0.587×𝐺+0.299×𝑅
如果取BGR各个分量的平均值也能得到一个灰度值，但效果不够符合人眼视觉。因为人眼对不同颜色敏感程度不同：绿色最敏感,红色次之,蓝色最不敏感;所以灰度转换时，绿色权重最大，蓝色最小。

二、第一次实现

static void BM_bgr2gary(benchmark::State &state)
{
    auto bgr = imread("img.png");
    CV_Assert(!bgr.empty());
    CV_Assert(bgr.type() == CV_8UC3);
    auto col = bgr.cols;
    auto row = bgr.rows;

    const auto src_data = bgr.data;

    for (auto _ : state)
    {
        Mat gray(row, col, CV_8UC1);
        const auto gray_data = gray.data;
        for (int i = 0; i < row; ++i)
        {

            auto *bgr_row_data = src_data + (i * bgr.step);
            auto *gray_row_data = gray_data + (i * gray.step);
            for (int j = 0; j < col; ++j)
            {
                gray_row_data[j] = (29 * bgr_row_data[3 * j] + 150 * bgr_row_data[3 * j + 1] + 77 * bgr_row_data[3 * j + 2]) / 256;
            }
        }
        benchmark::DoNotOptimize(gray.data);
        benchmark::ClobberMemory();
    }
}

static void BM_cvbgr2gary(benchmark::State &state)
{
    auto bgr = imread("img.png");
    CV_Assert(!bgr.empty());
    CV_Assert(bgr.type() == CV_8UC3);
    auto col = bgr.cols;
    auto row = bgr.rows;

    for (auto _ : state)
    {
        Mat gray(row, col, CV_8UC1);
        cvtColor(bgr, gray, COLOR_BGR2GRAY);
        benchmark::DoNotOptimize(gray.data);
        benchmark::ClobberMemory();
    }
}

Benchmark Time CPU Iterations

BM_bgr2gary 275831 ns 269859 ns 2496
BM_cvbgr2gary 55085 ns 53694 ns 12049

感觉效果还行?依旧开始读代码吧

三、[cvtColor](opencv/modules/imgproc/src/color_yuv.simd.hpp at 4.x · opencv/opencv)源码阅读

//cv源码
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)
{
    CV_INSTRUMENT_REGION();

    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
}

template<>
struct RGB2Gray<uchar>
{
    typedef uchar channel_type;

    static const int BY = BY15;
    static const int GY = GY15;
    static const int RY = RY15;
    static const int shift = gray_shift;

    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
    {
        const int coeffs0[] = { RY, GY, BY };
        for(int i = 0; i < 3; i++)
                coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int scn = srccn;
        short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
        int i = 0;

#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_uint8>::vlanes();
        v_int16 bg2y;
        v_int16 r12y;
        v_int16 dummy;
        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
        v_int16 delta = vx_setall_s16(1 << (shift-1));

        for( ; i <= n-vsize;
             i += vsize, src += scn*vsize, dst += vsize)
        {
            v_uint8 r, g, b, a;
            if(scn == 3)
            {
                v_load_deinterleave(src, b, g, r);
            }
            else
            {
                v_load_deinterleave(src, b, g, r, a);
            }

            //TODO: shorten registers use when v_deinterleave is available

            v_uint16 r0, r1, g0, g1, b0, b1;
            v_expand(r, r0, r1);
            v_expand(g, g0, g1);
            v_expand(b, b0, b1);

            v_int16 bg00, bg01, bg10, bg11;
            v_int16 rd00, rd01, rd10, rd11;
            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);

            v_uint32 y00, y01, y10, y11;
            y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
            y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
            y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
            y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));

            v_uint16 y0, y1;
            y0 = v_pack(y00, y01);
            y1 = v_pack(y10, y11);

            v_uint8 y = v_pack(y0, y1);
            v_store(dst, y);
        }
        vx_cleanup();
#endif

        for( ; i < n; i++, src += scn, dst++)
        {
            int b = src[0], g = src[1], r = src[2];
            uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
            dst[0] = y;
        }
    }

    int srccn;
    short coeffs[3];
};

外层cvtBGRtoGray的作用:

void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)

第一，判断输入图像的数据类型

CV_8U :uchar
CV_16U :ushort
float

按照输入类型进行CvtColorLoop

内部的

static const int BY = BY15;
static const int GY = GY15;
static const int RY = RY15;
static const int shift = gray_shift;

本质就是 Gray=0.114B+0.587G+0.299R 但是OpenCV为了效率将它换为整数计算,再右移还原

沟槽构造函数

RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
{
    const int coeffs0[] = { RY, GY, BY };
    for(int i = 0; i < 3; i++)
            coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
    if(blueIdx == 0)
        std::swap(coeffs[0], coeffs[2]);

    CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
}

默认顺序先按 R, G, B 放,如果 blueIdx == 0，说明当前输入是 BGR，那么要交换首尾

核心计算

for( ; i < n; i++, src += scn, dst++)
{
    int b = src[0], g = src[1], r = src[2];
    uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
    dst[0] = y;
}

先做整数加权和
再除以 2^shift
顺便做四舍五入

OpenCV 这里用的是 15位缩放版本，比我的八位缩放精度更高

OpenCV比我快的地方

#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_uint8>::vlanes();
        v_int16 bg2y;
        v_int16 r12y;
        v_int16 dummy;
        v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
        v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
        v_int16 delta = vx_setall_s16(1 << (shift-1));

        for( ; i <= n-vsize;
             i += vsize, src += scn*vsize, dst += vsize)
        {
            v_uint8 r, g, b, a;
            if(scn == 3)
            {
                v_load_deinterleave(src, b, g, r);
            }
            else
            {
                v_load_deinterleave(src, b, g, r, a);
            }

            //TODO: shorten registers use when v_deinterleave is available

            v_uint16 r0, r1, g0, g1, b0, b1;
            v_expand(r, r0, r1);
            v_expand(g, g0, g1);
            v_expand(b, b0, b1);

            v_int16 bg00, bg01, bg10, bg11;
            v_int16 rd00, rd01, rd10, rd11;
            v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
            v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
            v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
            v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);

            v_uint32 y00, y01, y10, y11;
            y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
            y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
            y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
            y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));

            v_uint16 y0, y1;
            y0 = v_pack(y00, y01);
            y1 = v_pack(y10, y11);

            v_uint8 y = v_pack(y0, y1);
            v_store(dst, y);
        }
        vx_cleanup();
#endif

意思不是一个像素一个像素算，而是：一次加载很多个 BGR 像素,拆成 b/g/r 三组向量,并行做乘法、加法、右移,最后一次性得到一批灰度值

所以执行顺序可以理解为：

能用 SIMD 的部分，先批量算
剩下不足一批的尾巴，再走下面这个普通循环

for( ; i < n; i++, src += scn, dst++)

腾讯云开发者社区

腾讯云面向开发者汇聚海量精品云计算使用和开发经验，营造开放的云计算技术生态圈。

更多推荐

Elasticsearch复杂数据类型终极指南：从入门到精通

Elasticsearch作为功能强大的搜索引擎，支持多种复杂数据类型，让开发者能够灵活处理各种结构化和非结构化数据。本文将带你全面了解Elasticsearch中的复杂数据类型，从基础概念到实际应用，助你轻松掌握数据建模的核心技巧。## 内部对象：构建层级化数据结构在Elasticsearch中，对象类型（Object）是最基础的复杂数据类型之一，用于表示具有嵌套关系的数据。例如，我们可

腾讯云开发者社区

终极指南：Flink SQL连接器版本管理从混乱到有序的升级之路

Apache Flink作为流处理领域的佼佼者，其SQL连接器的版本管理一直是开发者面临的核心挑战。本文将系统讲解Flink SQL连接器版本管理的最佳实践，帮助你轻松应对版本兼容性问题，实现从混乱到有序的升级之旅。## 连接器版本管理的常见痛点 😫在Flink应用开发中，连接器版本管理常常让开发者头疼不已。不同版本的连接器可能导致各种兼容性问题，例如API变更、功能差异甚至运行时错误。

腾讯云开发者社区

如何快速搭建Neon无服务器PostgreSQL：面向初学者的完整指南

Neon是一款革命性的无服务器PostgreSQL解决方案，它通过分离存储和计算层，实现了自动扩缩容、类代码式数据库分支以及零级扩展能力。本指南将帮助你从零开始搭建Neon开发环境，体验这款创新数据库的强大功能。## 准备工作：环境要求与依赖项在开始搭建Neon环境前，请确保你的系统满足以下要求：- Linux操作系统（推荐Ubuntu 20.04+或Debian 11+）- Git