静音检测算法优化版本
该算法首先计算每一帧的短时能量、过零率(时域特征)以及谱平坦度、频谱质心(频域特征)共四种特征,然后利用自适应阈值根据具体情况进行判断。其中,短时能量阈值、过零率阈值、谱平坦度阈值和频谱质心阈值都是动态更新的,以适应不同的噪声和音频质量。需要注意的是,该算法利用 FFT 算法计算频域特征,因此需要在编译时链接相应的 FFT 库。此外,在实际使用时,仍然需要对参数进行调整和优化,并根据具体情况进行修改和改进。
·
以下是一个结合时域与频域特征的音频静音检测算法示例,该算法综合多个特征并基于自适应阈值进行判断,能够更好地适应不同的噪声和音频质量:
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
// Tuning constants for the silence detector.
constexpr int kSampleRate = 16000; // Sample rate, in samples per second
constexpr int kFrameSize = 320; // Frame size in samples (20 ms at kSampleRate)
constexpr int kSilenceDurationThreshold = 400; // Silence duration threshold (ms)
constexpr double kSilenceEnergyFactor = 0.4; // Silence energy factor: scales a frame's short-time energy when seeding the energy threshold
constexpr double kZcrFactor = 3.5; // Zero-crossing-rate factor: scales a frame's ZCR when seeding the ZCR threshold
constexpr double kSpectralFlatnessFactor = 4.5; // Spectral flatness factor: scales a frame's flatness when seeding the flatness threshold
constexpr double kSpectralCentroidFactor = 1.1; // Spectral centroid factor: scales a frame's centroid when seeding the centroid threshold
/// Computes the short-time energy (mean squared amplitude) of one frame.
///
/// @param frame PCM samples of the frame; any length is accepted.
/// @return Sum of the squared samples divided by the number of samples, or
///         0.0 when the frame is empty.
double ComputeShortTimeEnergy(const std::vector<short>& frame) {
    if (frame.empty()) {
        return 0.0;  // Nothing to average; avoids a 0/0 division.
    }

    long long energy_sum = 0;
    for (const short sample : frame) {
        // Widen before multiplying so the square does not depend on the width of int.
        energy_sum += static_cast<long long>(sample) * sample;
    }

    // Normalise by the actual sample count rather than a fixed frame size so
    // the mean is correct for frames of any length.
    return static_cast<double>(energy_sum) / static_cast<double>(frame.size());
}
/// Computes the zero-crossing rate of one frame.
///
/// A crossing is counted when two adjacent samples have strictly opposite
/// signs; a pair that contains an exact zero is not counted.
///
/// @param frame PCM samples of the frame; any length is accepted.
/// @return Number of crossings divided by the number of adjacent sample pairs,
///         or 0.0 when the frame has fewer than two samples.
double ComputeZeroCrossingRate(const std::vector<short>& frame) {
    const std::size_t sample_count = frame.size();
    if (sample_count < 2) {
        return 0.0;  // No adjacent pair exists, so there is nothing to count.
    }

    std::size_t crossings = 0;
    // Iterate over the real frame length so a short frame is never read out of bounds.
    for (std::size_t i = 1; i < sample_count; ++i) {
        const short previous = frame[i - 1];
        const short current = frame[i];
        if ((current > 0 && previous < 0) || (current < 0 && previous > 0)) {
            ++crossings;
        }
    }

    return static_cast<double>(crossings) / static_cast<double>(sample_count - 1);
}
/// Computes the spectral flatness of one frame, in decibels.
///
/// Flatness is the ratio of the geometric mean to the arithmetic mean of the
/// magnitude spectrum over bins 0 .. N/2, expressed as 10 * log10(ratio).
/// The result is at most about 0 dB (perfectly flat spectrum) and becomes more
/// negative the more the energy is concentrated in a few bins.
///
/// The magnitudes are obtained with a direct O(N^2) DFT so that this function
/// has no dependency on an external FFT library.
///
/// @param frame PCM samples of the frame; any length is accepted.
/// @return Flatness in dB. An empty frame returns 0.0; an all-zero frame
///         returns approximately 0 dB because every bin is clamped to the same
///         epsilon.
double ComputeSpectralFlatness(const std::vector<short>& frame) {
    const std::size_t sample_count = frame.size();
    if (sample_count == 0) {
        return 0.0;
    }

    constexpr double kTwoPi = 6.283185307179586476925;
    constexpr double kEpsilon = 1e-10;  // Keeps log() and the division finite for empty bins.

    const std::size_t bin_count = sample_count / 2 + 1;
    double log_magnitude_sum = 0.0;
    double magnitude_sum = 0.0;

    for (std::size_t k = 0; k < bin_count; ++k) {
        double real_part = 0.0;
        double imag_part = 0.0;
        for (std::size_t t = 0; t < sample_count; ++t) {
            // Reduce k*t modulo N first so the angle stays small and accurate.
            const double angle =
                kTwoPi * static_cast<double>((k * t) % sample_count) / static_cast<double>(sample_count);
            real_part += frame[t] * std::cos(angle);
            imag_part -= frame[t] * std::sin(angle);
        }
        const double magnitude = std::hypot(real_part, imag_part);
        log_magnitude_sum += std::log(magnitude + kEpsilon);
        magnitude_sum += magnitude;
    }

    // The geometric mean is exp(mean(ln x)); the mean of the logs on its own is
    // not a magnitude and may be negative.
    const double geometric_mean = std::exp(log_magnitude_sum / static_cast<double>(bin_count));
    const double arithmetic_mean = magnitude_sum / static_cast<double>(bin_count);
    return 10.0 * std::log10(geometric_mean / (arithmetic_mean + kEpsilon));
}
/// Computes the spectral centroid of one frame, in Hz.
///
/// The centroid is the magnitude-weighted mean frequency over bins 1 .. N/2
/// (the DC bin is skipped), where bin k has frequency k * kSampleRate / N.
///
/// The magnitudes are obtained with a direct O(N^2) DFT so that this function
/// has no dependency on an external FFT library.
///
/// @param frame PCM samples of the frame; any length is accepted.
/// @return Centroid frequency in Hz, or 0.0 for an empty frame. A frame with
///         no non-DC content yields a value near 0 because of the epsilon in
///         the denominator.
double ComputeSpectralCentroid(const std::vector<short>& frame) {
    const std::size_t sample_count = frame.size();
    if (sample_count == 0) {
        return 0.0;
    }

    constexpr double kTwoPi = 6.283185307179586476925;
    constexpr double kEpsilon = 1e-10;  // Keeps the division finite when all bins are empty.

    const std::size_t bin_count = sample_count / 2 + 1;
    const double bin_width_hz = static_cast<double>(kSampleRate) / static_cast<double>(sample_count);

    double weighted_sum = 0.0;
    double magnitude_sum = 0.0;

    // Start at bin 1: the DC component carries no frequency information.
    for (std::size_t k = 1; k < bin_count; ++k) {
        double real_part = 0.0;
        double imag_part = 0.0;
        for (std::size_t t = 0; t < sample_count; ++t) {
            // Reduce k*t modulo N first so the angle stays small and accurate.
            const double angle =
                kTwoPi * static_cast<double>((k * t) % sample_count) / static_cast<double>(sample_count);
            real_part += frame[t] * std::cos(angle);
            imag_part -= frame[t] * std::sin(angle);
        }
        const double magnitude = std::hypot(real_part, imag_part);
        weighted_sum += magnitude * (static_cast<double>(k) * bin_width_hz);
        magnitude_sum += magnitude;
    }

    return weighted_sum / (magnitude_sum + kEpsilon);
}
/// Scans an audio file frame by frame and reports whether it contains a long
/// enough run of consecutive frames classified as silent.
///
/// No header is parsed: the file bytes are read directly into `short` samples,
/// kFrameSize samples per frame. A trailing partial frame is ignored.
///
/// For every frame the four features (short-time energy, zero-crossing rate,
/// spectral flatness, spectral centroid) are computed. Each threshold is
/// seeded from the first frame (feature * its k...Factor constant) and then
/// updated as a running average over all frames read so far. A frame counts as
/// silent when all four features are below their thresholds.
///
/// @param audio_file Path of the file to analyse.
/// @return true as soon as the silent run covers at least
///         kSilenceDurationThreshold milliseconds; false otherwise, including
///         when the file cannot be opened (an error is written to std::cerr).
bool DetectSilence(const std::string& audio_file) {
    std::ifstream fin(audio_file, std::ios::binary);
    if (!fin) {
        std::cerr << "Error opening audio file.\n";
        return false;
    }

    // Convert the duration threshold from milliseconds to whole frames,
    // rounding up: frames = ceil(ms * samples_per_second / (1000 * samples_per_frame)).
    constexpr long long kSilenceSamplesTimes1000 =
        static_cast<long long>(kSilenceDurationThreshold) * kSampleRate;
    constexpr long long kFrameSamplesTimes1000 = 1000LL * kFrameSize;
    constexpr long long kRequiredSilentFrames =
        (kSilenceSamplesTimes1000 + kFrameSamplesTimes1000 - 1) / kFrameSamplesTimes1000;

    std::vector<short> frame(kFrameSize);
    const auto frame_bytes = static_cast<std::streamsize>(frame.size() * sizeof(short));

    double silence_energy_threshold = 0.0;
    double zcr_threshold = 0.0;
    double spectral_flatness_threshold = 0.0;
    double spectral_centroid_threshold = 0.0;

    // Two separate counters: one weights the running averages, the other
    // tracks the current silent run. Sharing a single counter made every
    // non-silent frame reseed the thresholds from itself, and a frame can never
    // be below kSilenceEnergyFactor (< 1) times its own energy.
    long long frames_seen = 0;
    long long silent_run = 0;

    // Folds one new observation into a running average that already covers `frames_seen` values.
    const auto fold_in = [&frames_seen](double average, double observation) {
        const double weight = static_cast<double>(frames_seen);
        return (average * weight + observation) / (weight + 1.0);
    };

    // The stream owns the file handle and closes it on every return path.
    while (fin.read(reinterpret_cast<char*>(frame.data()), frame_bytes)) {
        // Per-frame features.
        const double short_time_energy = ComputeShortTimeEnergy(frame);
        const double zcr = ComputeZeroCrossingRate(frame);
        const double spectral_flatness = ComputeSpectralFlatness(frame);
        const double spectral_centroid = ComputeSpectralCentroid(frame);

        // Update the adaptive thresholds.
        if (frames_seen == 0) {
            silence_energy_threshold = kSilenceEnergyFactor * short_time_energy;
            zcr_threshold = kZcrFactor * zcr;
            spectral_flatness_threshold = kSpectralFlatnessFactor * spectral_flatness;
            spectral_centroid_threshold = kSpectralCentroidFactor * spectral_centroid;
        } else {
            silence_energy_threshold = fold_in(silence_energy_threshold, short_time_energy);
            zcr_threshold = fold_in(zcr_threshold, zcr);
            spectral_flatness_threshold = fold_in(spectral_flatness_threshold, spectral_flatness);
            spectral_centroid_threshold = fold_in(spectral_centroid_threshold, spectral_centroid);
        }
        ++frames_seen;

        // A frame is silent only when every feature is below its threshold.
        // NOTE(review): the "<" direction is kept as originally written for all
        // four features; whether low flatness / centroid / ZCR really indicates
        // silence should be validated against real recordings.
        const bool is_silent = short_time_energy < silence_energy_threshold &&
                               zcr < zcr_threshold &&
                               spectral_flatness < spectral_flatness_threshold &&
                               spectral_centroid < spectral_centroid_threshold;
        if (!is_silent) {
            silent_run = 0;
            continue;
        }

        // Report silence once the run is long enough.
        ++silent_run;
        if (silent_run >= kRequiredSilentFrames) {
            return true;
        }
    }

    return false;
}
/// Command-line entry point: expects exactly one argument, the audio file path.
///
/// Prints a usage line to std::cerr and returns -1 when the argument count is
/// wrong; otherwise prints the detection verdict to std::cout and returns 0.
int main(int argc, char* argv[]) {
    const bool has_single_path = (argc == 2);
    if (!has_single_path) {
        std::cerr << "Usage: " << argv[0] << " audio_file\n";
        return -1;
    }

    // Choose the message first, then emit it with a single stream write.
    const char* verdict = DetectSilence(argv[1])
                              ? "Audio file contains silence.\n"
                              : "Audio file does not contain silence.\n";
    std::cout << verdict;
    return 0;
}
该算法首先计算每一帧的短时能量、过零率(时域特征)以及谱平坦度、频谱质心(频域特征)共四种特征,然后利用自适应阈值根据具体情况进行判断。其中,短时能量阈值、过零率阈值、谱平坦度阈值和频谱质心阈值都是动态更新的,以适应不同的噪声和音频质量。
需要注意的是,该算法利用了 FFT 算法计算了频域特征,因此需要在编译时链接相应的 FFT 库。此外,在实际使用时,仍然需要对参数进行调整和优化,并根据具体情况进行修改和改进。
更多推荐
所有评论(0)