统计真实数据不同测序位置碱基的错误率,引入到DNA片段中,从而模拟生成DNA测序数据。

参考 ART: a next-generation sequencing read simulator - PMC

#include <iostream>
#include <random>
#include <string>
#include <cassert>

using namespace std;

#define SEQ_LEN 36


inline double r_prob(){
    return rand()/(double)RAND_MAX;
}

static char rand_base(){
    short base=(short)ceil(r_prob()*4);
    //std::cout << "r_prob()*4 " << r_prob()*4 << std::endl;
    //std::cout << "ceil(r_prob()*4) " << ceil(r_prob()*4) << std::endl;
    
    switch(base){
        case 1:
            return 'A';
        case 2:
            return 'C';
        case 3:
            return 'G';
        case 4:
            return 'T';
    default:
            return 'N';
    }
}

//根据碱基的错误率计算质量分数
array<double, SEQ_LEN> getQList(array<double, SEQ_LEN> err_rate_lst){
    array<double, SEQ_LEN> cal_qual_1st;
    for(size_t i=0; i<SEQ_LEN; i++){
        cal_qual_1st[i] = -10*log10(err_rate_lst[i]);
    }
    return cal_qual_1st;
}

 
//based on calibrated position-depended error rates
int add_calib_error_1st(string &seq_read, array<double, SEQ_LEN> err_rate_lst){
    assert(seq_read.size() == SEQ_LEN && err_rate_lst.size()== SEQ_LEN);
    
    int num=0;
    for(size_t i=0; i<SEQ_LEN; i++){
        if(r_prob() < err_rate_lst[i]) {
            char achar=seq_read[i];
            while(seq_read[i]==achar){ achar=rand_base(); }
            seq_read[i]=achar;
            num++;
        }
    }
    return num;
}

int main() {
    // 统计不同位置的测序错误概率,模拟数据
    // 实际错误率比这低很多!!!
    std::array<double, SEQ_LEN> cal_err_rate_1st = {0.001195, 0.001195, 0.001235, 0.0013425, 0.00146, 0.0014975, 0.001485, 0.0014525, 0.0013275, 0.0011675, 0.0011675, 0.00215, 0.0055125, 0.0097875, 0.011615, 0.01253, 0.0176525, 0.0279425, 0.0326375, 0.026815, 0.0257325, 0.03285, 0.04062, 0.0448075, 0.043745, 0.04701, 0.0504975, 0.051395, 0.05926, 0.0622425, 0.0556075, 0.054135, 0.06112, 0.082305, 0.1124975};

    std::array<double, SEQ_LEN> cal_err_rate_2nd = {0.006395, 0.00664, 0.0070175, 0.007225, 0.006895, 0.006125, 0.007495, 0.00923, 0.011055, 0.0126975, 0.009685, 0.0079975, 0.009095, 0.0100525, 0.0116925, 0.01151, 0.01174, 0.01361, 0.01402, 0.01626, 0.0415125, 0.06215, 0.040825, 0.023065, 0.02608, 0.0300525, 0.0507825, 0.0699575, 0.05991, 0.0638375, 0.10265, 0.1286775, 0.1198675, 0.1104025, 0.11194};
    
    array<double, SEQ_LEN> quality_lst_1st = getQList(cal_err_rate_1st);
    
    std::cout << "read 质量分数: "<< std::endl;
    for (int i = 0; i < SEQ_LEN; i++){
        std::cout << quality_lst_1st[i] << std::endl;
    }
    
    // test data
    string test_dna_seq_1, test_dna_seq_2;
    for (int i = 0; i < SEQ_LEN; i++){
        test_dna_seq_1 += rand_base();
    }
    std::cout << "原序列test_dna_seq_1: " << std::endl;
    std::cout << test_dna_seq_1 << std::endl;
    
    for (int i = 0; i < SEQ_LEN; i++){
        test_dna_seq_2 += rand_base();
    }
    
    std::cout << "原序列test_dna_seq_2: " << std::endl;
    std::cout << test_dna_seq_2 << std::endl;
    
    int mut_num1 = add_calib_error_1st(test_dna_seq_1, cal_err_rate_1st);
    int mut_num2 = add_calib_error_1st(test_dna_seq_2, cal_err_rate_2nd);
    
    std::cout << "模拟测序序列test_dna_seq_1: " << std::endl;
    std::cout << test_dna_seq_1 << std::endl;
    std::cout << "错误个数: " << mut_num1 << std::endl;
    
    std::cout << "模拟测序序列test_dna_seq_2: " << std::endl;
    std::cout << test_dna_seq_2 << std::endl;
    std::cout << "错误个数: " << mut_num2 << std::endl;
    
    return 0;
}

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐