20 Profile& operator=(const Profile&);
23 void update(const std::string&, const RefSeq&, int, int, double);
26 double getProb(const std::string&, const RefSeq&, int, int);
28 void collect(const Profile&);
33 void startSimulation();
34 std::string simulate(simul*, int, int, int, const RefSeq&);
35 void finishSimulation();
38 static const int NCODES = 5;
40 int proLen; // profile length
41 int size; // # of items in p;
42 double (*p)[NCODES][NCODES]; //profile matrices
44 double (*pc)[NCODES][NCODES]; // for simulation
47 Profile::Profile(int maxL) {
49 size = proLen * NCODES * NCODES;
50 p = new double[proLen][NCODES][NCODES];
51 memset(p, 0, sizeof(double) * size);
53 //set initial parameters
55 double probN = 1e-5, portionC = 0.99; //portionC, among ACGT, the portion of probability mass the correct base takes
58 for (int i = 0; i < proLen; i++) {
59 for (int j = 0; j < NCODES - 1; j++) {
61 probC = portionC * (1.0 - probN);
62 probO = (1.0 - portionC) / (NCODES - 2) * (1.0 - probN);
64 for (int k = 0; k < NCODES - 1; k++) {
65 p[i][j][k] = (j == k ? probC : probO);
69 for (int k = 0; k < NCODES - 1; k++)
70 p[i][N][k] = (1.0 - probN) / (NCODES - 1);
74 Profile& Profile::operator=(const Profile& rv) {
75 if (this == &rv) return *this;
76 if (proLen != rv.proLen) {
80 p = new double[rv.proLen][NCODES][NCODES];
82 memcpy(p, rv.p, sizeof(double) * rv.size);
87 void Profile::init() {
88 memset(p, 0, sizeof(double) * size);
91 void Profile::update(const std::string& readseq, const RefSeq& refseq, int pos, int dir, double frac) {
92 int len = readseq.size();
93 for (int i = 0; i < len; i++) {
94 p[i][refseq.get_id(i + pos, dir)][get_base_id(readseq[i])] += frac;
98 void Profile::finish() {
101 for (int i = 0; i < proLen; i++) {
102 for (int j = 0; j < NCODES; j++) {
104 for (int k = 0; k < NCODES; k++) sum += p[i][j][k];
106 for (int k = 0; k < NCODES; k++) p[i][j][k] = 0.0;
109 for (int k = 0; k < NCODES; k++) p[i][j][k] /= sum;
114 double Profile::getProb(const std::string& readseq, const RefSeq& refseq, int pos, int dir) {
116 int len = readseq.size();
118 for (int i = 0; i < len; i++) {
119 prob *= p[i][refseq.get_id(i + pos, dir)][get_base_id(readseq[i])];
126 void Profile::collect(const Profile& o) {
127 for (int i = 0; i < proLen; i++)
128 for (int j = 0; j < NCODES; j++)
129 for (int k = 0; k < NCODES; k++)
130 p[i][j][k] += o.p[i][j][k];
133 void Profile::read(FILE *fi) {
134 int tmp_prolen, tmp_ncodes;
135 fscanf(fi, "%d %d", &tmp_prolen, &tmp_ncodes);
136 assert(tmp_ncodes == NCODES);
137 if (tmp_prolen != proLen) {
140 size = proLen * NCODES * NCODES;
141 p = new double[proLen][NCODES][NCODES];
142 memset(p, 0, sizeof(double) * size);
145 for (int i = 0; i < proLen; i++)
146 for (int j = 0; j < NCODES; j++)
147 for (int k = 0; k < NCODES; k++)
148 fscanf(fi, "%lf", &p[i][j][k]);
151 void Profile::write(FILE* fo) {
152 fprintf(fo, "%d %d\n", proLen, NCODES);
153 for (int i = 0; i < proLen; i++) {
154 for (int j = 0; j < NCODES; j++) {
155 for (int k = 0; k < NCODES - 1; k++)
156 fprintf(fo, "%.10g ", p[i][j][k]);
157 fprintf(fo, "%.10g\n", p[i][j][NCODES - 1]);
159 if (i < proLen - 1) { fprintf(fo, "\n"); }
163 void Profile::startSimulation() {
164 pc = new double[proLen][NCODES][NCODES];
165 for (int i = 0; i < proLen; i++) {
166 for (int j = 0; j < NCODES; j++)
167 for (int k = 0; k < NCODES; k++) {
168 pc[i][j][k] = p[i][j][k];
169 if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
171 //avoid sampling from 0.0!!!
172 double cp_sum, cp_d, cp_n;
173 cp_sum = cp_d = cp_n = 0.0;
175 for (int j = 0; j < NCODES - 1; j++) {
176 cp_sum += pc[i][j][NCODES - 1];
178 cp_n += p[i][j][NCODES - 1];
181 if (cp_sum == 0.0) continue;
183 double p_d, p_o, p_n;
186 p_o = (1.0 - p_d - p_n) / (NCODES - 2);
187 for (int j = 0; j < NCODES - 1; j++) {
188 if (pc[i][j][NCODES - 1] > 0.0) continue;
189 for (int k = 0; k < NCODES; k++) {
190 if (k == j) pc[i][j][k] = p_d;
191 else if (k == NCODES - 1) pc[i][j][k] = p_n;
192 else pc[i][j][k] = p_o;
193 if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
196 if (pc[i][NCODES - 1][NCODES - 1] == 0.0) {
197 p_o = (1.0 - p_n) / (NCODES - 1);
198 for (int k = 0; k < NCODES; k++) {
199 pc[i][NCODES - 1][k] = (k < NCODES - 1 ? p_o : p_n);
200 if (k > 0) pc[i][NCODES - 1][k] += pc[i][NCODES - 1][k - 1];
207 std::string Profile::simulate(simul* sampler, int len, int pos, int dir, const RefSeq& refseq) {
208 std::string readseq = "";
210 for (int i = 0; i < len; i++) {
211 readseq.push_back(getCharacter(sampler->sample(pc[i][refseq.get_id(i + pos, dir)], NCODES)));
216 void Profile::finishSimulation() {
220 #endif /* PROFILE_H_ */