5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) {
17 //whereever there is a base in the mask, save that value is query and subject
18 for (int i = 0; i < seqMask.length(); i++) {
19 if (isalpha(seqMask[i])) {
26 errorOut(e, "DeCalculator", "setMask");
30 //***************************************************************************************************************
31 void DeCalculator::runMask(Sequence* seq) {
34 string q = seq->getAligned();
35 string tempQuery = "";
37 //whereever there is a base in the mask, save that value is query and subject
38 set<int>::iterator setit;
39 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
40 tempQuery += q[*setit];
44 seq->setAligned(tempQuery);
45 seq->setUnaligned(tempQuery);
48 errorOut(e, "DeCalculator", "runMask");
52 //***************************************************************************************************************
53 //num is query's spot in querySeqs
54 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
57 string q = query->getAligned();
58 string s = subject->getAligned();
61 for (int i = 0; i < q.length(); i++) {
62 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
66 for (int i = q.length(); i >= 0; i--) {
67 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
74 errorOut(e, "DeCalculator", "trimSeqs");
78 //***************************************************************************************************************
79 //find the window breaks for each sequence - this is so you can move ahead by bases.
80 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
85 int cutoff = back - front; //back - front
87 //if window is set to default
88 if (size == 0) { if (cutoff > 1200) { size = 300; }
89 else{ size = (cutoff / 4); } }
90 else if (size > (cutoff / 4)) {
91 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
95 string seq = query->getAligned().substr(front, cutoff);
99 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
102 win.push_back(front);
104 //move ahead increment bases at a time until all bases are in a window
106 int totalBases = 0; //used to eliminate window of blanks at end of sequence
108 seq = query->getAligned();
109 for (int m = front; m < (back - size) ; m++) {
111 //count number of bases you see
112 if (isalpha(seq[m])) { countBases++; totalBases++; }
114 //if you have seen enough bases to make a new window
115 if (countBases >= increment) {
116 win.push_back(m); //save spot in alignment
117 countBases = 0; //reset bases you've seen in this window
120 //no need to continue if all your bases are in a window
121 if (totalBases == numBases) { break; }
127 catch(exception& e) {
128 errorOut(e, "DeCalculator", "findWindows");
133 //***************************************************************************************************************
134 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
138 //cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl;
139 for (int m = 0; m < window.size(); m++) {
141 string seqFrag = query->getAligned().substr(window[m], size);
142 string seqFragsub = subject->getAligned().substr(window[m], size);
143 //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl;
145 for (int b = 0; b < seqFrag.length(); b++) {
147 if (seqFrag[b] != seqFragsub[b]) { diff++; }
150 //percentage of mismatched bases
152 dist = diff / (float) seqFrag.length() * 100;
154 temp.push_back(dist);
159 catch(exception& e) {
160 errorOut(e, "DeCalculator", "calcObserved");
164 //***************************************************************************************************************
165 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
168 //so you only look at the trimmed part of the sequence
169 int cutoff = back - front;
171 //from first startpoint with length back-front
172 string seqFrag = query->getAligned().substr(front, cutoff);
173 string seqFragsub = subject->getAligned().substr(front, cutoff);
176 for (int b = 0; b < seqFrag.length(); b++) {
177 if (seqFrag[b] != seqFragsub[b]) { diff++; }
180 //percentage of mismatched bases
181 float dist = diff / (float) seqFrag.length() * 100;
185 catch(exception& e) {
186 errorOut(e, "DeCalculator", "calcDist");
191 //***************************************************************************************************************
192 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
196 vector<float> queryExpected;
198 for (int m = 0; m < qav.size(); m++) {
200 float expected = qav[m] * coef;
202 queryExpected.push_back(expected);
205 return queryExpected;
208 catch(exception& e) {
209 errorOut(e, "DeCalculator", "calcExpected");
213 //***************************************************************************************************************
214 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
218 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
219 for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
221 float de = sqrt((sum / (obs.size() - 1)));
225 catch(exception& e) {
226 errorOut(e, "DeCalculator", "calcDE");
231 //***************************************************************************************************************
233 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
237 string freqfile = getRootName(filename) + "freq";
240 openOutputFile(freqfile, outFreq);
242 //at each position in the sequence
243 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
245 vector<int> freq; freq.resize(4,0);
248 //find the frequency of each nucleotide
249 for (int j = 0; j < seqs.size(); j++) {
251 char value = seqs[j]->getAligned()[i];
253 if(toupper(value) == 'A') { freq[0]++; }
254 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
255 else if(toupper(value) == 'G') { freq[2]++; }
256 else if(toupper(value) == 'C') { freq[3]++; }
260 //find base with highest frequency
262 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
265 //subtract gaps to "ignore them"
266 if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; }
267 else { highFreq = highest / (float) (seqs.size() - gaps); }
270 Pi = (highFreq - 0.25) / 0.75;
272 //cannot have probability less than 0.
273 if (Pi < 0) { Pi = 0.0; }
275 //saves this for later
276 outFreq << i+1 << '\t' << highFreq << endl;
278 if (h.count(i) > 0) {
279 cout << i+1 << '\t' << highFreq << endl;
289 catch(exception& e) {
290 errorOut(e, "DeCalculator", "calcFreq");
294 //***************************************************************************************************************
295 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
297 vector<float> averages;
299 //for each window find average
300 for (int m = 0; m < window.size(); m++) {
304 //while you are in the window for this sequence
306 for (int j = window[m]; j < (window[m]+size); j++) {
308 //is this a spot that is included in the mask
309 if (h.count(j) > 0) {
310 average += probabilityProfile[j];
315 average = average / count;
317 //save this windows average
318 averages.push_back(average);
323 catch(exception& e) {
324 errorOut(e, "DeCalculator", "findQav");
329 //***************************************************************************************************************
330 vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
332 vector< vector<float> > quan;
334 //percentage of mismatched pairs 1 to 100
339 for(int i = start; i < end; i++){
341 mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine();
342 Sequence* query = seqs[i];
344 //compare to every other sequence in template
345 for(int j = 0; j < i; j++){
347 Sequence* subject = seqs[j];
350 map<int, int>::iterator it;
352 trimSeqs(query, subject, trim);
355 int front = it->first; int back = it->second;
357 //reset window for each new comparison
358 windowSizesTemplate[i] = window;
360 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
362 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
364 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
366 float alpha = getCoef(obsi, q);
368 vector<float> exp = calcExpected(q, alpha);
370 float de = calcDE(obsi, exp);
372 float dist = calcDist(query, subject, front, back);
376 //dist-1 because vector indexes start at 0.
377 quan[dist-1].push_back(de);
385 catch(exception& e) {
386 errorOut(e, "DeCalculator", "findQav");
391 //***************************************************************************************************************
392 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
395 //find average prob for this seqs windows
396 float probAverage = 0.0;
397 for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; }
398 probAverage = probAverage / (float) qav.size();
400 //find observed average
401 float obsAverage = 0.0;
402 for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; }
403 obsAverage = obsAverage / (float) obs.size();
404 //cout << "sum ai / m = " << probAverage << endl;
405 //cout << "sum oi / m = " << obsAverage << endl;
406 float coef = obsAverage / probAverage;
410 catch(exception& e) {
411 errorOut(e, "DeCalculator", "getCoef");
415 //***************************************************************************************************************