OpenCV源码阅读二:cvtColor阅读
意思不是一个像素一个像素算,而是:一次加载很多个 BGR 像素,拆成 b/g/r 三组向量,并行做乘法、加法、右移,最后一次性得到一批灰度值。本质就是 Gray=0.114B+0.587G+0.299R 但是OpenCV为了效率将它换为整数计算,再右移还原。如果取BGR各个分量的平均值也能得到一个灰度值,但效果不够符合人眼视觉。所以灰度转换时,绿色权重最大,蓝色最小。第一,判断输入图像的数据类型。
依旧先来我自己实现的版本,首先我查阅了一下Bgr2Gray的原理:
一、核心原理
对于一个像素:
- B:蓝色分量
- G:绿色分量
- R:红色分量
灰度值 Gray 通常按下面公式计算:
𝐺𝑟𝑎𝑦=0.114×𝐵+0.587×𝐺+0.299×𝑅
如果取BGR各个分量的平均值也能得到一个灰度值,但效果不够符合人眼视觉。因为人眼对不同颜色敏感程度不同:绿色最敏感,红色次之,蓝色最不敏感;所以灰度转换时,绿色权重最大,蓝色最小。
二、第一次实现
static void BM_bgr2gary(benchmark::State &state)
{
auto bgr = imread("img.png");
CV_Assert(!bgr.empty());
CV_Assert(bgr.type() == CV_8UC3);
auto col = bgr.cols;
auto row = bgr.rows;
const auto src_data = bgr.data;
for (auto _ : state)
{
Mat gray(row, col, CV_8UC1);
const auto gray_data = gray.data;
for (int i = 0; i < row; ++i)
{
auto *bgr_row_data = src_data + (i * bgr.step);
auto *gray_row_data = gray_data + (i * gray.step);
for (int j = 0; j < col; ++j)
{
gray_row_data[j] = (29 * bgr_row_data[3 * j] + 150 * bgr_row_data[3 * j + 1] + 77 * bgr_row_data[3 * j + 2]) / 256;
}
}
benchmark::DoNotOptimize(gray.data);
benchmark::ClobberMemory();
}
}
static void BM_cvbgr2gary(benchmark::State &state)
{
auto bgr = imread("img.png");
CV_Assert(!bgr.empty());
CV_Assert(bgr.type() == CV_8UC3);
auto col = bgr.cols;
auto row = bgr.rows;
for (auto _ : state)
{
Mat gray(row, col, CV_8UC1);
cvtColor(bgr, gray, COLOR_BGR2GRAY);
benchmark::DoNotOptimize(gray.data);
benchmark::ClobberMemory();
}
}
Benchmark Time CPU Iterations
BM_bgr2gary 275831 ns 269859 ns 2496
BM_cvbgr2gary 55085 ns 53694 ns 12049
感觉效果还行?依旧开始读代码吧
三、[cvtColor](opencv/modules/imgproc/src/color_yuv.simd.hpp at 4.x · opencv/opencv)源码阅读
//cv源码
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int scn, bool swapBlue)
{
CV_INSTRUMENT_REGION();
int blueIdx = swapBlue ? 2 : 0;
if( depth == CV_8U )
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
else if( depth == CV_16U )
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
else
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
}
template<>
struct RGB2Gray<uchar>
{
typedef uchar channel_type;
static const int BY = BY15;
static const int GY = GY15;
static const int RY = RY15;
static const int shift = gray_shift;
RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
{
const int coeffs0[] = { RY, GY, BY };
for(int i = 0; i < 3; i++)
coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
if(blueIdx == 0)
std::swap(coeffs[0], coeffs[2]);
CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
}
void operator()(const uchar* src, uchar* dst, int n) const
{
int scn = srccn;
short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_int16 bg2y;
v_int16 r12y;
v_int16 dummy;
v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
v_int16 delta = vx_setall_s16(1 << (shift-1));
for( ; i <= n-vsize;
i += vsize, src += scn*vsize, dst += vsize)
{
v_uint8 r, g, b, a;
if(scn == 3)
{
v_load_deinterleave(src, b, g, r);
}
else
{
v_load_deinterleave(src, b, g, r, a);
}
//TODO: shorten registers use when v_deinterleave is available
v_uint16 r0, r1, g0, g1, b0, b1;
v_expand(r, r0, r1);
v_expand(g, g0, g1);
v_expand(b, b0, b1);
v_int16 bg00, bg01, bg10, bg11;
v_int16 rd00, rd01, rd10, rd11;
v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);
v_uint32 y00, y01, y10, y11;
y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));
v_uint16 y0, y1;
y0 = v_pack(y00, y01);
y1 = v_pack(y10, y11);
v_uint8 y = v_pack(y0, y1);
v_store(dst, y);
}
vx_cleanup();
#endif
for( ; i < n; i++, src += scn, dst++)
{
int b = src[0], g = src[1], r = src[2];
uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
dst[0] = y;
}
}
int srccn;
short coeffs[3];
};
- 外层
cvtBGRtoGray的作用:
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int scn, bool swapBlue)
第一,判断输入图像的数据类型
- CV_8U :uchar
- CV_16U :ushort
- float
- 按照输入类型进行
CvtColorLoop
内部的
static const int BY = BY15;
static const int GY = GY15;
static const int RY = RY15;
static const int shift = gray_shift;
本质就是 Gray=0.114B+0.587G+0.299R 但是OpenCV为了效率将它换为整数计算,再右移还原
沟槽构造函数
RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
{
const int coeffs0[] = { RY, GY, BY };
for(int i = 0; i < 3; i++)
coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]);
if(blueIdx == 0)
std::swap(coeffs[0], coeffs[2]);
CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift));
}
默认顺序先按 R, G, B 放,如果 blueIdx == 0,说明当前输入是 BGR,那么要交换首尾
- 核心计算
for( ; i < n; i++, src += scn, dst++)
{
int b = src[0], g = src[1], r = src[2];
uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift);
dst[0] = y;
}
-
先做整数加权和
-
再除以 2^shift
-
顺便做四舍五入
OpenCV 这里用的是 15位缩放版本,比我的八位缩放精度更高
- OpenCV比我快的地方
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_int16 bg2y;
v_int16 r12y;
v_int16 dummy;
v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy);
v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy);
v_int16 delta = vx_setall_s16(1 << (shift-1));
for( ; i <= n-vsize;
i += vsize, src += scn*vsize, dst += vsize)
{
v_uint8 r, g, b, a;
if(scn == 3)
{
v_load_deinterleave(src, b, g, r);
}
else
{
v_load_deinterleave(src, b, g, r, a);
}
//TODO: shorten registers use when v_deinterleave is available
v_uint16 r0, r1, g0, g1, b0, b1;
v_expand(r, r0, r1);
v_expand(g, g0, g1);
v_expand(b, b0, b1);
v_int16 bg00, bg01, bg10, bg11;
v_int16 rd00, rd01, rd10, rd11;
v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01);
v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11);
v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01);
v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);
v_uint32 y00, y01, y10, y11;
y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));
v_uint16 y0, y1;
y0 = v_pack(y00, y01);
y1 = v_pack(y10, y11);
v_uint8 y = v_pack(y0, y1);
v_store(dst, y);
}
vx_cleanup();
#endif
意思不是一个像素一个像素算,而是:一次加载很多个 BGR 像素,拆成 b/g/r 三组向量,并行做乘法、加法、右移,最后一次性得到一批灰度值
所以执行顺序可以理解为:
- 能用 SIMD 的部分,先批量算
- 剩下不足一批的尾巴,再走下面这个普通循环
for( ; i < n; i++, src += scn, dst++)
更多推荐
所有评论(0)