c++ 生成模拟测序数据代码
统计真实数据不同测序位置碱基的错误率,引入到DNA片段中,从而模拟生成DNA测序数据。
·
统计真实数据不同测序位置碱基的错误率,引入到DNA片段中,从而模拟生成DNA测序数据。
参考 ART: a next-generation sequencing read simulator - PMC
#include <iostream>
#include <random>
#include <string>
#include <cassert>
using namespace std;
#define SEQ_LEN 36
inline double r_prob(){
return rand()/(double)RAND_MAX;
}
static char rand_base(){
short base=(short)ceil(r_prob()*4);
//std::cout << "r_prob()*4 " << r_prob()*4 << std::endl;
//std::cout << "ceil(r_prob()*4) " << ceil(r_prob()*4) << std::endl;
switch(base){
case 1:
return 'A';
case 2:
return 'C';
case 3:
return 'G';
case 4:
return 'T';
default:
return 'N';
}
}
//根据碱基的错误率计算质量分数
array<double, SEQ_LEN> getQList(array<double, SEQ_LEN> err_rate_lst){
array<double, SEQ_LEN> cal_qual_1st;
for(size_t i=0; i<SEQ_LEN; i++){
cal_qual_1st[i] = -10*log10(err_rate_lst[i]);
}
return cal_qual_1st;
}
//based on calibrated position-depended error rates
int add_calib_error_1st(string &seq_read, array<double, SEQ_LEN> err_rate_lst){
assert(seq_read.size() == SEQ_LEN && err_rate_lst.size()== SEQ_LEN);
int num=0;
for(size_t i=0; i<SEQ_LEN; i++){
if(r_prob() < err_rate_lst[i]) {
char achar=seq_read[i];
while(seq_read[i]==achar){ achar=rand_base(); }
seq_read[i]=achar;
num++;
}
}
return num;
}
int main() {
// 统计不同位置的测序错误概率,模拟数据
// 实际错误率比这低很多!!!
std::array<double, SEQ_LEN> cal_err_rate_1st = {0.001195, 0.001195, 0.001235, 0.0013425, 0.00146, 0.0014975, 0.001485, 0.0014525, 0.0013275, 0.0011675, 0.0011675, 0.00215, 0.0055125, 0.0097875, 0.011615, 0.01253, 0.0176525, 0.0279425, 0.0326375, 0.026815, 0.0257325, 0.03285, 0.04062, 0.0448075, 0.043745, 0.04701, 0.0504975, 0.051395, 0.05926, 0.0622425, 0.0556075, 0.054135, 0.06112, 0.082305, 0.1124975};
std::array<double, SEQ_LEN> cal_err_rate_2nd = {0.006395, 0.00664, 0.0070175, 0.007225, 0.006895, 0.006125, 0.007495, 0.00923, 0.011055, 0.0126975, 0.009685, 0.0079975, 0.009095, 0.0100525, 0.0116925, 0.01151, 0.01174, 0.01361, 0.01402, 0.01626, 0.0415125, 0.06215, 0.040825, 0.023065, 0.02608, 0.0300525, 0.0507825, 0.0699575, 0.05991, 0.0638375, 0.10265, 0.1286775, 0.1198675, 0.1104025, 0.11194};
array<double, SEQ_LEN> quality_lst_1st = getQList(cal_err_rate_1st);
std::cout << "read 质量分数: "<< std::endl;
for (int i = 0; i < SEQ_LEN; i++){
std::cout << quality_lst_1st[i] << std::endl;
}
// test data
string test_dna_seq_1, test_dna_seq_2;
for (int i = 0; i < SEQ_LEN; i++){
test_dna_seq_1 += rand_base();
}
std::cout << "原序列test_dna_seq_1: " << std::endl;
std::cout << test_dna_seq_1 << std::endl;
for (int i = 0; i < SEQ_LEN; i++){
test_dna_seq_2 += rand_base();
}
std::cout << "原序列test_dna_seq_2: " << std::endl;
std::cout << test_dna_seq_2 << std::endl;
int mut_num1 = add_calib_error_1st(test_dna_seq_1, cal_err_rate_1st);
int mut_num2 = add_calib_error_1st(test_dna_seq_2, cal_err_rate_2nd);
std::cout << "模拟测序序列test_dna_seq_1: " << std::endl;
std::cout << test_dna_seq_1 << std::endl;
std::cout << "错误个数: " << mut_num1 << std::endl;
std::cout << "模拟测序序列test_dna_seq_2: " << std::endl;
std::cout << test_dna_seq_2 << std::endl;
std::cout << "错误个数: " << mut_num2 << std::endl;
return 0;
}
更多推荐
所有评论(0)