以下是一个基于时域与频域特征的音频静音检测算法示例,该算法结合了短时能量、过零率、谱平坦度和频谱质心等多个特征,并基于自适应阈值进行判断,能够更好地适应不同的噪声和音频质量:

#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Audio framing and detection parameters. Frames are 20 ms of 16 kHz
// mono 16-bit PCM (320 samples). The *Factor constants scale per-frame
// feature values into adaptive decision thresholds in DetectSilence().
constexpr int kSampleRate = 16000;    // sample rate in Hz
constexpr int kFrameSize = 320;       // samples per frame (20 ms at 16 kHz)
constexpr int kSilenceDurationThreshold = 400;   // minimum silence duration in ms
constexpr double kSilenceEnergyFactor = 0.4;     // scales short-time energy into its threshold
constexpr double kZcrFactor = 3.5;               // scales zero-crossing rate into its threshold
constexpr double kSpectralFlatnessFactor = 4.5;  // scales spectral flatness into its threshold
constexpr double kSpectralCentroidFactor = 1.1;  // scales spectral centroid into its threshold

// Short-time energy of one frame: mean of the squared sample values.
//
// Fixed to divide by the actual frame length instead of the global
// kFrameSize, so the function is correct for any frame size, and to
// return 0.0 for an empty frame (the original divided regardless).
// The per-sample product is widened to 64-bit before accumulating.
double ComputeShortTimeEnergy(const std::vector<short>& frame) {
    if (frame.empty()) {
        return 0.0;
    }
    long long energy_sum = 0;
    for (const short sample : frame) {
        energy_sum += static_cast<long long>(sample) * sample;
    }
    return static_cast<double>(energy_sum) / static_cast<double>(frame.size());
}

// Zero-crossing rate of one frame: the fraction of adjacent sample
// pairs whose signs are strictly opposite (a zero sample breaks a run,
// matching the original's strict > / < comparisons).
//
// Fixed to use the actual frame length instead of the global kFrameSize
// and to return 0.0 for frames shorter than two samples (the original
// would index out of range / divide by kFrameSize - 1 regardless).
double ComputeZeroCrossingRate(const std::vector<short>& frame) {
    const std::size_t n = frame.size();
    if (n < 2) {
        return 0.0;
    }
    int zcr_cnt = 0;
    for (std::size_t i = 1; i < n; ++i) {
        const bool pos_to_neg = frame[i] < 0 && frame[i - 1] > 0;
        const bool neg_to_pos = frame[i] > 0 && frame[i - 1] < 0;
        if (pos_to_neg || neg_to_pos) {
            ++zcr_cnt;
        }
    }
    return static_cast<double>(zcr_cnt) / static_cast<double>(n - 1);
}

// Spectral flatness measure (SFM) of one frame, in dB:
//   SFM = 10 * log10(geometric_mean / arithmetic_mean)
// over the magnitude spectrum (bins 0..N/2).
//
// Two fixes versus the original:
//  1. The original called an external, undeclared fft() routine; the
//     magnitude spectrum is now computed with a self-contained naive
//     DFT (N = 320 keeps this cheap).
//  2. Formula bug: the original stored the *mean of log10 magnitudes*
//     in `geometric_mean` (typically a negative number) and then took
//     log10 of it again, yielding NaN. The geometric mean is now kept
//     in the log domain and the ratio formed as a difference of logs.
//
// Returns ~0 dB for a flat spectrum (e.g. an all-zero frame) and
// increasingly negative values for tonal frames; 0.0 for empty input.
double ComputeSpectralFlatness(const std::vector<short>& frame) {
    const std::size_t n = frame.size();
    if (n == 0) {
        return 0.0;
    }
    const std::size_t bins = n / 2 + 1;
    constexpr double kTwoPi = 6.283185307179586;

    double log_sum = 0.0;  // sum of log10 magnitudes (log-domain geometric mean)
    double lin_sum = 0.0;  // sum of magnitudes (arithmetic mean)
    for (std::size_t k = 0; k < bins; ++k) {
        double re = 0.0;
        double im = 0.0;
        for (std::size_t i = 0; i < n; ++i) {
            const double angle =
                kTwoPi * static_cast<double>(k * i) / static_cast<double>(n);
            re += frame[i] * std::cos(angle);
            im -= frame[i] * std::sin(angle);
        }
        const double magnitude = std::hypot(re, im);
        log_sum += std::log10(magnitude + 1e-10);  // epsilon guards log10(0)
        lin_sum += magnitude;
    }

    const double mean_log = log_sum / static_cast<double>(bins);
    const double arithmetic_mean = lin_sum / static_cast<double>(bins);
    return 10.0 * (mean_log - std::log10(arithmetic_mean + 1e-10));
}

// Spectral centroid of one frame: the magnitude-weighted mean frequency
// in Hz over bins 1..N/2 (bin 0 / DC is excluded, matching the original
// loop bounds; bin k corresponds to frequency k * kSampleRate / N).
//
// The original called an external, undeclared fft() routine; the
// magnitude spectrum is now computed with a self-contained naive DFT.
// Also fixes the signed/unsigned loop comparison (`int i < size_t`).
// Returns ~0 for an all-zero or empty frame (epsilon-guarded divide).
double ComputeSpectralCentroid(const std::vector<short>& frame) {
    const std::size_t n = frame.size();
    if (n == 0) {
        return 0.0;
    }
    const std::size_t bins = n / 2 + 1;
    constexpr double kTwoPi = 6.283185307179586;

    double numerator = 0.0;
    double denominator = 0.0;
    for (std::size_t k = 1; k < bins; ++k) {  // skip DC, as before
        double re = 0.0;
        double im = 0.0;
        for (std::size_t i = 0; i < n; ++i) {
            const double angle =
                kTwoPi * static_cast<double>(k * i) / static_cast<double>(n);
            re += frame[i] * std::cos(angle);
            im -= frame[i] * std::sin(angle);
        }
        const double magnitude = std::hypot(re, im);
        const double freq_hz =
            static_cast<double>(k) * kSampleRate / static_cast<double>(n);
        numerator += magnitude * freq_hz;
        denominator += magnitude;
    }

    return numerator / (denominator + 1e-10);
}

bool DetectSilence(const std::string& audio_file) {
    std::ifstream fin(audio_file, std::ios::binary);
    if (!fin) {
        std::cerr << "Error opening audio file.\n";
        return false;
    }

    std::vector<short> frame(kFrameSize);
    double short_time_energy = 0;
    double spectral_flatness = 0;
    double spectral_centroid = 0;
    double silence_energy_threshold = 0;
    double zcr_threshold = 0;
    double spectral_flatness_threshold = 0;
    double spectral_centroid_threshold = 0;
    int silence_frame_cnt = 0;

    while (fin.read((char*)&frame[0], kFrameSize * sizeof(short))) {
        // 计算各帧频域特征
        short_time_energy = ComputeShortTimeEnergy(frame);
        double zcr = ComputeZeroCrossingRate(frame);
        spectral_flatness = ComputeSpectralFlatness(frame);
        spectral_centroid = ComputeSpectralCentroid(frame);

        // 实时更新阈值
        if (silence_frame_cnt == 0) {
            silence_energy_threshold = kSilenceEnergyFactor * short_time_energy;
            zcr_threshold = kZcrFactor * zcr;
            spectral_flatness_threshold = kSpectralFlatnessFactor * spectral_flatness;
            spectral_centroid_threshold = kSpectralCentroidFactor * spectral_centroid;
        } else {
            silence_energy_threshold = (silence_energy_threshold * silence_frame_cnt + short_time_energy) /
                                        (silence_frame_cnt + 1);
            zcr_threshold = (zcr_threshold * silence_frame_cnt + zcr) / (silence_frame_cnt + 1);
            spectral_flatness_threshold = (spectral_flatness_threshold * silence_frame_cnt + spectral_flatness) /
                                           (silence_frame_cnt + 1);
            spectral_centroid_threshold = (spectral_centroid_threshold * silence_frame_cnt + spectral_centroid) /
                                           (silence_frame_cnt + 1);
        }
        silence_frame_cnt++;

        // 判断短时能量、过零率、能量谱平坦度和频谱中心性是否小于阈值,如果小于则认为检测到静音
        if (short_time_energy < silence_energy_threshold &&
                zcr < zcr_threshold &&
                spectral_flatness < spectral_flatness_threshold &&
                spectral_centroid < spectral_centroid_threshold) {
            // 判断是否存在连续多帧的静音
            if (silence_frame_cnt >= kSampleRate / kFrameSize * kSilenceDurationThreshold) {
                fin.close();
                return true;
            }
        } else {
            silence_frame_cnt = 0;
        }
    }

    fin.close();
    return false;
}

// Entry point: expects exactly one argument, the path to a raw PCM
// audio file, and prints whether a silent stretch was detected.
int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " audio_file\n";
        return -1;
    }

    const bool has_silence = DetectSilence(argv[1]);
    std::cout << (has_silence ? "Audio file contains silence.\n"
                              : "Audio file does not contain silence.\n");
    return 0;
}

该算法首先对每一帧计算短时能量和过零率这两种时域特征,以及谱平坦度和频谱质心(频谱中心性)这两种频域特征,然后利用自适应阈值根据具体情况进行判断。其中,短时能量阈值、过零率阈值、谱平坦度阈值和频谱质心阈值都是动态更新的,以适应不同的噪声和音频质量。

需要注意的是,该算法利用了 FFT 算法计算了频域特征,因此需要在编译时链接相应的 FFT 库。此外,在实际使用时,仍然需要对参数进行调整和优化,并根据具体情况进行修改和改进。

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐