From 2ad3477dcd6a01628712b32e767353f917f2a95a Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 22 Jul 2009 11:35:33 +0000 Subject: [PATCH] changed sequence class so that when constructor is called aligned and unaligned values are filled, also worked on chimeras --- bellerophon.h | 1 + chimera.h | 1 + chimeraseqscommand.cpp | 11 +- chimeraseqscommand.h | 2 +- pintail.cpp | 724 +++++++++++++++++++++++------------------ pintail.h | 31 +- sequence.cpp | 13 +- 7 files changed, 451 insertions(+), 332 deletions(-) diff --git a/bellerophon.h b/bellerophon.h index 05e93d7..32b38e4 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -40,6 +40,7 @@ class Bellerophon : public Chimera { void print(ostream&); void setCons(string){}; + void setQuantiles(string) {}; private: diff --git a/chimera.h b/chimera.h index e0e8944..2febd00 100644 --- a/chimera.h +++ b/chimera.h @@ -35,6 +35,7 @@ class Chimera { virtual void setIncrement(int i) { increment = i; } virtual void setCons(string) {}; + virtual void setQuantiles(string) {}; //pure functions diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 28e3e6a..08c02a8 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -22,7 +22,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantiles" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -48,6 +48,10 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ if (consfile == "not open") { abort = true; } else if (consfile == "not found") { consfile = ""; } + quanfile = validParameter.validFile(parameters, "quantiles", true); + if (quanfile == "not open") { abort = true; } + else if (quanfile == "not found") { consfile = ""; } + string temp; temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } @@ -111,8 +115,13 @@ int ChimeraSeqsCommand::execute(){ if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); + //saves time to avoid generating it if (consfile != "") { chimera->setCons(consfile); } else { chimera->setCons(""); } + + //saves time to avoid generating it + if (quanfile != "") { chimera->setQuantiles(quanfile); } + else { chimera->setQuantiles(""); } }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 9bc8418..7721ef8 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -28,7 +28,7 @@ public: private: bool abort; - string method, fastafile, templatefile, consfile; + string method, fastafile, templatefile, consfile, quanfile; bool filter, correction; int processors, midpoint, averageLeft, averageRight, window, iters, increment; Chimera* chimera; diff --git a/pintail.cpp b/pintail.cpp index 5f4abd0..72be679 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -8,7 +8,7 @@ */ #include "pintail.h" -#include "eachgapdist.h" +#include "ignoregaps.h" //*************************************************************************************************************** @@ -68,9 +68,10 @@ void Pintail::getChimeras() { DE.resize(numSeqs); Qav.resize(numSeqs); bestfit.resize(numSeqs); - trim.resize(numSeqs); deviation.resize(numSeqs); + trimmed.resize(numSeqs); windowSizes.resize(numSeqs, window); + windows.resize(numSeqs); //break up file if needed int linesPerProcess = processors / numSeqs; @@ -91,27 +92,48 @@ void Pintail::getChimeras() { lines.push_back(new linePair(0, numSeqs)); #endif - distcalculator = new eachGapDist(); - + distcalculator = new ignoreGaps(); + + if (processors == 1) { mothurOut("Finding closest sequence in template to each sequence... "); cout.flush(); bestfit = findPairs(lines[0]->start, lines[0]->end); -for (int m = 0; m < templateSeqs.size(); m++) { - if (templateSeqs[m]->getName() == "198806") { bestfit[0] = *(templateSeqs[m]); } - if (templateSeqs[m]->getName() == "198806") { bestfit[1] = *(templateSeqs[m]); } - if (templateSeqs[m]->getName() == "108139") { bestfit[2] = *(templateSeqs[m]); } -} - -for (int j = 0; j < bestfit.size(); j++) {//cout << querySeqs[j]->getName() << '\t' << "length = " << querySeqs[j]->getAligned().length() << '\t' << bestfit[j].getName() << " length = " << bestfit[j].getAligned().length() << endl; +/*for (int m = 0; m < templateSeqs.size(); m++) { + if (templateSeqs[m]->getName() == "159481") { bestfit[17] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "100137") { bestfit[16] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "112956") { bestfit[15] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "102326") { bestfit[14] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "66229") { bestfit[13] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "206276") { bestfit[12] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "63607") { bestfit[11] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "7056") { bestfit[10] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "7088") { bestfit[9] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "17553") { bestfit[8] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "131723") { bestfit[7] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "69013") { bestfit[6] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "24543") { bestfit[5] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "27824") { bestfit[4] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "1456") { bestfit[3] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "1456") { bestfit[2] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "141312") { bestfit[1] = *(templateSeqs[m]); } + if (templateSeqs[m]->getName() == "141312") { bestfit[0] = *(templateSeqs[m]); } + + +}*/ + + for (int j = 0; j < bestfit.size(); j++) { //chops off beginning and end of sequences so they both start and end with a base trimSeqs(querySeqs[j], bestfit[j], j); -//cout << "NEW SEQ PAIR" << querySeqs[j]->getAligned() << endl << "IN THE MIDDLE" << endl << bestfit[j].getAligned() << endl; - -} - + } mothurOut("Done."); mothurOutEndLine(); + + for (int i = lines[0]->start; i < lines[0]->end; i++) { + it = trimmed[i].begin(); + vector win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); + windows[i] = win; + } - windows = findWindows(lines[0]->start, lines[0]->end); + } else { createProcessesSpots(); } //find P @@ -124,43 +146,67 @@ for (int j = 0; j < bestfit.size(); j++) {//cout << querySeqs[j]->getName() << ' if (processors == 1) { mothurOut("Calculating observed distance... "); cout.flush(); - obsDistance = calcObserved(lines[0]->start, lines[0]->end); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]); + obsDistance[i] = obsi; + } mothurOut("Done."); mothurOutEndLine(); + + mothurOut("Finding variability... "); cout.flush(); - Qav = findQav(lines[0]->start, lines[0]->end); -for (int i = 0; i < Qav.size(); i++) { -cout << querySeqs[i]->getName() << " = "; -for (int u = 0; u < Qav[i].size();u++) { cout << Qav[i][u] << '\t'; } -cout << endl << endl; -} - - + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector q = findQav(windows[i], windowSizes[i]); + Qav[i] = q; + } mothurOut("Done."); mothurOutEndLine(); + + mothurOut("Calculating alpha... "); cout.flush(); - seqCoef = getCoef(lines[0]->start, lines[0]->end); -for (int i = 0; i < seqCoef.size(); i++) { -cout << querySeqs[i]->getName() << " coef = " << seqCoef[i] << endl; -} - + for (int i = lines[0]->start; i < lines[0]->end; i++) { + float alpha = getCoef(obsDistance[i], Qav[i]); + seqCoef.push_back(alpha); + } mothurOut("Done."); mothurOutEndLine(); - + + + mothurOut("Calculating expected distance... "); cout.flush(); - expectedDistance = calcExpected(lines[0]->start, lines[0]->end); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector exp = calcExpected(Qav[i], seqCoef[i]); + expectedDistance[i] = exp; + } mothurOut("Done."); mothurOutEndLine(); - mothurOut("Finding deviation... "); cout.flush(); - DE = calcDE(lines[0]->start, lines[0]->end); - deviation = calcDist(lines[0]->start, lines[0]->end); - mothurOut("Done."); mothurOutEndLine(); + mothurOut("Finding deviation... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + float de = calcDE(obsDistance[i], expectedDistance[i]); + DE[i] = de; + + it = trimmed[i].begin(); + float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + deviation[i] = dist; + } + mothurOut("Done."); mothurOutEndLine(); } else { createProcesses(); } delete distcalculator; + + //quantiles are used to determine whether the de values found indicate a chimera + //if you have to calculate them, its time intensive because you are finding the de and deviation values for each + //combination of sequences in the template + if (quanfile != "") { readQuantiles(); } + else { + if (processors == 1) { + quantiles = getQuantiles(lines[0]->start, lines[0]->end); + }else { createProcessesQuan(); } + } + //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } @@ -186,9 +232,9 @@ vector Pintail::readSeqs(string file) { while(!in.eof()){ Sequence* current = new Sequence(in); - if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); } + //if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); } //takes out stuff is needed - current->setUnaligned(current->getUnaligned()); + //current->setUnaligned(current->getUnaligned()); container.push_back(current); @@ -207,35 +253,32 @@ vector Pintail::readSeqs(string file) { //*************************************************************************************************************** //num is query's spot in querySeqs -void Pintail::trimSeqs(Sequence* query, Sequence& subject, int num) { +map Pintail::trimSeqs(Sequence* query, Sequence subject, int num) { try { - + string q = query->getAligned(); string s = subject.getAligned(); - + int front = 0; for (int i = 0; i < q.length(); i++) { if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; } } - q = q.substr(front, q.length()); - s = s.substr(front, s.length()); - int back = 0; for (int i = q.length(); i >= 0; i--) { if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; } } - - q = q.substr(0, back); - s = s.substr(0, back); - - trim[num][front] = back; - - //save - query->setAligned(q); - query->setUnaligned(q); - subject.setAligned(s); - subject.setUnaligned(s); + + //if num = -1 then you are calling this from quantiles + if (num != -1) { + trimmed[num][front] = back; + return trimmed[num]; + } + + map temp; + temp[front] = back; + return temp; + } catch(exception& e) { errorOut(e, "Pintail", "trimSeqs"); @@ -254,13 +297,14 @@ vector Pintail::readFreq() { vector prob; //read in probabilities and store in vector - int pos; float num; + int pos; float num; while(!in.eof()){ in >> pos >> num; - prob.push_back(num); + //do you want this spot + prob.push_back(num); gobble(in); } @@ -275,6 +319,49 @@ vector Pintail::readFreq() { } } +//*************************************************************************************************************** + +vector< vector > Pintail::readQuantiles() { + try { + + ifstream in; + openInputFile(quanfile, in); + + vector< vector > quan; + + //read in probabilities and store in vector + int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; + + while(!in.eof()){ + + in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + vector temp; + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + //do you want this spot + quan.push_back(temp); + + gobble(in); + } + + in.close(); + return quan; + + } + catch(exception& e) { + errorOut(e, "Pintail", "readQuantiles"); + exit(1); + } +} + + //*************************************************************************************************************** //calculate the distances from each query sequence to all sequences in the template to find the closest sequence vector Pintail::findPairs(int start, int end) { @@ -311,63 +398,51 @@ vector Pintail::findPairs(int start, int end) { } //*************************************************************************************************************** -//find the window breaks for each sequence -vector< vector > Pintail::findWindows(int start, int end) { +//find the window breaks for each sequence - this is so you can move ahead by bases. +vector Pintail::findWindows(Sequence* query, int front, int back, int& size) { try { - vector< vector > win; win.resize(end-start); + vector win; - //for each sequence - int count = 0; - for(int i = start; i < end; i++){ + int cutoff = back - front; //back - front - //if window is set to default - if (windowSizes[i] == 0) { if (querySeqs[i]->getAligned().length() > 1200) { windowSizes[i] = 300; } - else{ windowSizes[i] = (querySeqs[i]->getAligned().length() / 4); } } - else if (windowSizes[i] > (querySeqs[i]->getAligned().length() / 4)) { - mothurOut("You have selected to large a window size for sequence " + querySeqs[i]->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); - windowSizes[i] = (querySeqs[i]->getAligned().length() / 4); - } + //if window is set to default + if (size == 0) { if (cutoff > 1200) { size = 300; } + else{ size = (cutoff / 4); } } + else if (size > (cutoff / 4)) { + mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); + size = (cutoff / 4); + } - //cout << "length = " << querySeqs[i]->getAligned().length() << " window = " << windowSizes[i] << " increment = " << increment << endl; - - - string seq = querySeqs[i]->getAligned(); - int numBases = querySeqs[i]->getUnaligned().length(); - int spot = 0; + string seq = query->getAligned().substr(front, cutoff); - //find location of first base - for (int j = 0; j < seq.length(); j++) { - if (isalpha(seq[j])) { spot = j; break; } - } - - //save start of seq - win[count].push_back(spot); + //count bases + int numBases = 0; + for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } } + //save start of seq + win.push_back(front); + + //move ahead increment bases at a time until all bases are in a window + int countBases = 0; + int totalBases = 0; //used to eliminate window of blanks at end of sequence - //move ahead increment bases at a time until all bases are in a window - int countBases = 0; - int totalBases = 0; //used to eliminate window of blanks at end of sequence - for (int m = spot; m < seq.length(); m++) { + seq = query->getAligned(); + for (int m = front; m < (back - size) ; m++) { - //count number of bases you see - if (isalpha(seq[m])) { countBases++; totalBases++; } - - //if you have seen enough bases to make a new window - if (countBases >= increment) { - win[count].push_back(m); //save spot in alignment - countBases = 0; //reset bases you've seen in this window - } + //count number of bases you see + if (isalpha(seq[m])) { countBases++; totalBases++; } - //no need to continue if all your bases are in a window - if (totalBases == numBases) { break; } + //if you have seen enough bases to make a new window + if (countBases >= increment) { + win.push_back(m); //save spot in alignment + countBases = 0; //reset bases you've seen in this window } - - count++; + + //no need to continue if all your bases are in a window + if (totalBases == numBases) { break; } } - - - + return win; } @@ -378,46 +453,31 @@ vector< vector > Pintail::findWindows(int start, int end) { } //*************************************************************************************************************** -vector< vector > Pintail::calcObserved(int start, int end) { +vector Pintail::calcObserved(Sequence* query, Sequence subject, vector window, int size) { try { - vector< vector > temp; - temp.resize(end-start); - - int count = 0; - for(int i = start; i < end; i++){ - - Sequence* query = querySeqs[i]; - Sequence subject = bestfit[i]; - - int startpoint = 0; - for (int m = 0; m < windows[i].size(); m++) { - - string seqFrag = query->getAligned().substr(windows[i][startpoint], windowSizes[i]); - string seqFragsub = subject.getAligned().substr(windows[i][startpoint], windowSizes[i]); + vector temp; + + int startpoint = 0; + for (int m = 0; m < windows.size(); m++) { + + string seqFrag = query->getAligned().substr(window[startpoint], size); + string seqFragsub = subject.getAligned().substr(window[startpoint], size); - int diff = 0; - for (int b = 0; b < seqFrag.length(); b++) { - - //if either the query or subject is not a gap - if ((isalpha(seqFrag[b])) || (isalpha(seqFragsub[b]))) { - //and they are different - penalize - if (seqFrag[b] != seqFragsub[b]) { diff++; } - } - } + int diff = 0; + for (int b = 0; b < seqFrag.length(); b++) { + if (seqFrag[b] != seqFragsub[b]) { diff++; } + } - //percentage of mismatched bases - float dist; - dist = diff / (float) seqFrag.length() * 100; + //percentage of mismatched bases + float dist; + dist = diff / (float) seqFrag.length() * 100; - temp[count].push_back(dist); + temp.push_back(dist); - startpoint++; - } - - count++; + startpoint++; } - + return temp; } catch(exception& e) { @@ -426,37 +486,25 @@ vector< vector > Pintail::calcObserved(int start, int end) { } } //*************************************************************************************************************** -vector Pintail::calcDist(int start, int end) { +float Pintail::calcDist(Sequence* query, Sequence subject, int front, int back) { try { - vector temp; - - for(int i = start; i < end; i++){ - - Sequence* query = querySeqs[i]; - Sequence subject = bestfit[i]; + //so you only look at the trimmed part of the sequence + int cutoff = back - front; - string seqFrag = query->getAligned(); - string seqFragsub = subject.getAligned(); + //from first startpoint with length back-front + string seqFrag = query->getAligned().substr(front, cutoff); + string seqFragsub = subject.getAligned().substr(front, cutoff); - int diff = 0; - for (int b = 0; b < seqFrag.length(); b++) { - - //if either the query or subject is not a gap - if ((isalpha(seqFrag[b])) || (isalpha(seqFragsub[b]))) { - //and they are different - penalize - if (seqFrag[b] != seqFragsub[b]) { diff++; } - } - } + int diff = 0; + for (int b = 0; b < seqFrag.length(); b++) { + if (seqFrag[b] != seqFragsub[b]) { diff++; } + } - //percentage of mismatched bases - float dist; - dist = diff / (float) seqFrag.length() * 100; + //percentage of mismatched bases + float dist = diff / (float) seqFrag.length() * 100; - temp.push_back(dist); - } - - return temp; + return dist; } catch(exception& e) { errorOut(e, "Pintail", "calcDist"); @@ -465,32 +513,20 @@ vector Pintail::calcDist(int start, int end) { } //*************************************************************************************************************** -vector< vector > Pintail::calcExpected(int start, int end) { +vector Pintail::calcExpected(vector qav, float coef) { try { - vector< vector > temp; temp.resize(end-start); - - //for each sequence - int count = 0; - for(int i = start; i < end; i++){ - - float coef = seqCoef[i]; - - //for each window - vector queryExpected; - for (int m = 0; m < windows[i].size(); m++) { - float expected = Qav[i][m] * coef; - queryExpected.push_back(expected); -//cout << "average variabilty over window = " << averageProbability[m] << " coef = " << coef << " ei = " << expected << '\t' << "window = " << m << endl; - } - - temp[count] = queryExpected; - - count++; + //for each window + vector queryExpected; + for (int m = 0; m < qav.size(); m++) { + + float expected = qav[m] * coef; + + queryExpected.push_back(expected); } - - return temp; + + return queryExpected; } catch(exception& e) { @@ -499,30 +535,16 @@ vector< vector > Pintail::calcExpected(int start, int end) { } } //*************************************************************************************************************** -vector Pintail::calcDE(int start, int end) { +float Pintail::calcDE(vector obs, vector exp) { try { - vector temp; temp.resize(end-start); - - //for each sequence - int count = 0; - for(int i = start; i < end; i++){ - - vector obs = obsDistance[i]; - vector exp = expectedDistance[i]; - -// cout << "difference between obs and exp = " << abs(obs[m] - exp[m]) << endl; - //for each window - float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 - for (int m = 0; m < windows[i].size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } + //for each window + float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 + for (int m = 0; m < obsDistance.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } - float de = sqrt((sum / (windows[i].size() - 1))); + float de = sqrt((sum / (obsDistance.size() - 1))); - temp[count] = de; - count++; - } - - return temp; + return de; } catch(exception& e) { errorOut(e, "Pintail", "calcDE"); @@ -543,7 +565,7 @@ vector Pintail::calcFreq(vector seqs) { //at each position in the sequence for (int i = 0; i < seqs[0]->getAligned().length(); i++) { - + vector freq; freq.resize(4,0); int gaps = 0; @@ -551,7 +573,7 @@ vector Pintail::calcFreq(vector seqs) { for (int j = 0; j < seqs.size(); j++) { char value = seqs[j]->getAligned()[i]; - + if(toupper(value) == 'A') { freq[0]++; } else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; } else if(toupper(value) == 'G') { freq[2]++; } @@ -564,17 +586,20 @@ vector Pintail::calcFreq(vector seqs) { for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } float highFreq; - //if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } - //else { highFreq = highest / (float) (seqs.size() - gaps); } - highFreq = highest / (float) seqs.size(); -cout << i << '\t' << highFreq << endl; + if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } + else { highFreq = highest / (float) (seqs.size() - gaps); } + //highFreq = highest / (float) seqs.size(); + //cout << i << '\t' << highFreq << endl; float Pi; Pi = (highFreq - 0.25) / 0.75; + //cannot have probability less than 0. + if (Pi < 0) { Pi = 0.0; } + //saves this for later - outFreq << i << '\t' << Pi << endl; - + outFreq << i+1 << '\t' << Pi << endl; + prob.push_back(Pi); } @@ -589,34 +614,24 @@ cout << i << '\t' << highFreq << endl; } } //*************************************************************************************************************** -vector< vector > Pintail::findQav(int start, int end) { +vector Pintail::findQav(vector window, int size) { try { - vector< vector > averages; - map::iterator it; - - for(int i = start; i < end; i++){ - - //for each window find average - vector temp; - for (int m = 0; m < windows[i].size(); m++) { + vector averages; - float average = 0.0; + //for each window find average + for (int m = 0; m < windows.size(); m++) { - it = trim[i].begin(); //trim[i] is a map of where this sequence was trimmed + float average = 0.0; - //while you are in the window for this sequence - for (int j = windows[i][m]+it->first; j < (windows[i][m]+windowSizes[i]); j++) { average += probabilityProfile[j]; } + //while you are in the window for this sequence + for (int j = window[m]; j < (window[m]+size); j++) { average += probabilityProfile[j]; } - average = average / windowSizes[i]; - //cout << average << endl; - //save this windows average - temp.push_back(average); - } - - //save this qav - averages.push_back(temp); + average = average / size; + + //save this windows average + averages.push_back(average); } - + return averages; } catch(exception& e) { @@ -624,47 +639,66 @@ vector< vector > Pintail::findQav(int start, int end) { exit(1); } } + //*************************************************************************************************************** -vector Pintail::getCoef(int start, int end) { +vector< vector > Pintail::getQuantiles(int start, int end) { try { - vector coefs; - coefs.resize(end-start); + vector< vector > quan; - //find a coef for each sequence - int count = 0; + //for each sequence for(int i = start; i < end; i++){ - //find average prob for this seqs windows - float probAverage = 0.0; - for (int j = 0; j < Qav[i].size(); j++) { probAverage += Qav[i][j]; } - probAverage = probAverage / (float) Qav[i].size(); - cout << "(sum of ai) / m = " << probAverage << endl; - - vector temp = obsDistance[i]; + Sequence* query = templateSeqs[i]; + + //compare to every other sequence in template + for(int j = 0; j < i; j++){ + + Sequence subject = *(templateSeqs[j]); + + map trim = trimSeqs(query, subject, -1); + + + + + + } + - //find observed average - float obsAverage = 0.0; - for (int j = 0; j < temp.size(); j++) { obsAverage += temp[j]; } - obsAverage = obsAverage / (float) temp.size(); -cout << "(sum of oi) / m = " << obsAverage << endl; - float coef = obsAverage / probAverage; - - //save this sequences coefficient - coefs[count] = coef; - count++; } - + return quan; - return coefs; } catch(exception& e) { - errorOut(e, "Pintail", "getCoef"); + errorOut(e, "Pintail", "findQav"); exit(1); } } +//*************************************************************************************************************** +float Pintail::getCoef(vector obs, vector qav) { + try { + + //find average prob for this seqs windows + float probAverage = 0.0; + for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; } + probAverage = probAverage / (float) Qav.size(); + + //find observed average + float obsAverage = 0.0; + for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; } + obsAverage = obsAverage / (float) obs.size(); + + float coef = obsAverage / probAverage; + + return coef; + } + catch(exception& e) { + errorOut(e, "Pintail", "getCoef"); + exit(1); + } +} /**************************************************************************************************/ void Pintail::createProcessesSpots() { @@ -673,7 +707,6 @@ void Pintail::createProcessesSpots() { int process = 0; vector processIDS; vector< vector > win; win.resize(querySeqs.size()); - vector< map > t; t.resize(querySeqs.size()); //loop through and create all the processes you want while (process != processors) { @@ -692,20 +725,13 @@ void Pintail::createProcessesSpots() { //chops off beginning and end of sequences so they both start and end with a base trimSeqs(querySeqs[i], bestfit[i], i); - t[i] = trim[i]; - count++; } - - vector< vector > temp = findWindows(lines[process]->start, lines[process]->end); - - //move into best - count = 0; for (int i = lines[process]->start; i < lines[process]->end; i++) { - win[i] = temp[count]; - count++; + vector temp = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); + win[i] = temp; } exit(0); @@ -719,9 +745,18 @@ void Pintail::createProcessesSpots() { } windows = win; - trim = t; #else - windows = findWindows(lines[0]->start, lines[0]->end); + bestfit = findPairs(lines[0]->start, lines[0]->end); + for (int j = 0; j < bestfit.size(); j++) { + //chops off beginning and end of sequences so they both start and end with a base + trimSeqs(querySeqs[j], bestfit[j], j); + } + + for (int i = lines[0]->start; i < lines[0]->end; i++) { + it = trimmed[i].begin(); + map win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); + windows[i] = win; + } #endif } @@ -743,6 +778,7 @@ void Pintail::createProcesses() { vector< vector > exp; exp.resize(querySeqs.size()); vector de; de.resize(querySeqs.size()); vector< vector > obs; obs.resize(querySeqs.size()); + vector dev; dev.resize(querySeqs.size()); //loop through and create all the processes you want @@ -754,47 +790,30 @@ void Pintail::createProcesses() { process++; }else if (pid == 0){ - vector< vector > temp; - vector tempde; - int count = 0; - - - temp = calcObserved(lines[process]->start, lines[process]->end); - count = 0; - for (int i = lines[process]->start; i < lines[process]->end; i++) { - obs[i] = temp[count]; - count++; - } - - temp = findQav(lines[process]->start, lines[process]->end); - count = 0; - for (int i = lines[process]->start; i < lines[process]->end; i++) { - Qav[i] = temp[count]; - count++; - } - - tempde = getCoef(lines[process]->start, lines[process]->end); - count = 0; + //calc obs for (int i = lines[process]->start; i < lines[process]->end; i++) { - seqCoef[i] = tempde[count]; - count++; - } + vector obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]); + obs[i] = obsi; - temp = calcExpected(lines[process]->start, lines[process]->end); - count = 0; - for (int i = lines[process]->start; i < lines[process]->end; i++) { - exp[i] = temp[count]; - count++; + //calc Qav + vector q = findQav(windows[i], windowSizes[i]); + + //get alpha + float alpha = getCoef(obsDistance[i], q); + + //find expected + vector exp = calcExpected(q, alpha); + expectedDistance[i] = exp; + + //get de and deviation + float dei = calcDE(obsi, exp); + de[i] = dei; + + it = trimmed[i].begin(); + float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + dev[i] = dist; } - - tempde = calcDE(lines[process]->start, lines[process]->end); - count = 0; - for (int i = lines[process]->start; i < lines[process]->end; i++) { - de[i] = tempde[count]; - count++; - } - exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -808,14 +827,55 @@ void Pintail::createProcesses() { obsDistance = obs; expectedDistance = exp; DE = de; + deviation = dev; #else - bestfit = findPairs(lines[0]->start, lines[0]->end); - obsDistance = calcObserved(lines[0]->start, lines[0]->end); - Qav = findQav(lines[0]->start, lines[0]->end); - seqCoef = getCoef(lines[0]->start, lines[0]->end); - expectedDistance = calcExpected(lines[0]->start, lines[0]->end); - DE = calcDE(lines[0]->start, lines[0]->end); + mothurOut("Calculating observed distance... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]); + obsDistance[i] = obsi; + } + mothurOut("Done."); mothurOutEndLine(); + + + + mothurOut("Finding variability... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector q = findQav(windows[i], windowSizes[i]); + Qav[i] = q; + } + mothurOut("Done."); mothurOutEndLine(); + + + + mothurOut("Calculating alpha... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + float alpha = getCoef(obsDistance[i], Qav[i]); + seqCoef.push_back(alpha); + } + mothurOut("Done."); mothurOutEndLine(); + + + + mothurOut("Calculating expected distance... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + vector exp = calcExpected(Qav[i], seqCoef[i]); + expectedDistance[i] = exp; + } + mothurOut("Done."); mothurOutEndLine(); + + + + mothurOut("Finding deviation... "); cout.flush(); + for (int i = lines[0]->start; i < lines[0]->end; i++) { + float de = calcDE(obsDistance[i], expectedDistance[i]); + DE[i] = de; + + it = trimmed[i].begin(); + float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + deviation[i] = dist; + } + mothurOut("Done."); mothurOutEndLine(); #endif } @@ -825,6 +885,46 @@ void Pintail::createProcesses() { } } + +/**************************************************************************************************/ + +void Pintail::createProcessesQuan() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + + + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;i lines; vector querySeqs; vector templateSeqs; @@ -53,31 +55,38 @@ class Pintail : public Chimera { vector deviation; //deviation[0] is the percentage of mismatched pairs over the whole seq between querySeqs[0] and its best match. vector< vector > windows; // windows[0] is a vector containing the starting spot in queryseqs[0] aligned sequence for each window. //this is needed so you can move by bases and not just spots in the alignment - vector< map > trim; //trim[0] is the start and end position of trimmed querySeqs[0]. Used to find the variability over each sequence window. vector windowSizes; //windowSizes[0] = window size of querySeqs[0] + vector< map > trimmed; //trimmed[0] = start and stop of trimmed sequences for querySeqs[0] + map::iterator it; + vector< vector > Qav; //Qav[0] is the vector of average variablility for queryseqs[0]... vector seqCoef; //seqCoef[0] is the coeff for queryseqs[0]... vector DE; //DE[0] is the deviaation for queryseqs[0]... vector probabilityProfile; + vector< vector > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... vector readSeqs(string); - void trimSeqs(Sequence*, Sequence&, int); + map trimSeqs(Sequence*, Sequence, int); vector readFreq(); - vector< vector > findQav(int, int); + vector< vector > readQuantiles(); + vector< vector > getQuantiles(int, int); vector calcFreq(vector); - vector getCoef(int, int); + vector findPairs(int, int); - vector< vector > findWindows(int, int); - vector< vector > calcObserved(int, int); - vector< vector > calcExpected(int, int); - vector calcDE(int, int); - vector calcDist(int, int); + vector findWindows(Sequence*, int, int, int&); + vector calcObserved(Sequence*, Sequence, vector, int); + vector calcExpected(vector, float); + vector findQav(vector, int); + float calcDE(vector, vector); + float calcDist(Sequence*, Sequence, int, int); + float getCoef(vector, vector); void createProcessesSpots(); void createProcesses(); + void createProcessesQuan(); diff --git a/sequence.cpp b/sequence.cpp index 685f072..10a9648 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -21,10 +21,10 @@ Sequence::Sequence(string newName, string sequence) { initialize(); name = newName; - if(sequence.find_first_of('-') != string::npos) { - setAligned(sequence); - } + + //setUnaligned removes any gap characters for us setUnaligned(sequence); + setAligned(sequence); } //******************************************************************************************************************** @@ -53,10 +53,9 @@ Sequence::Sequence(ifstream& fastaFile){ } } - if((sequence.find_first_of('-') != string::npos) || (sequence.find_first_of('.') != string::npos)) { // if there are any gaps in the sequence, assume that it is - setAligned(sequence); // an alignment file - } - setUnaligned(sequence); // also set the unaligned sequence file + setAligned(sequence); + //setUnaligned removes any gap characters for us + setUnaligned(sequence); } //******************************************************************************************************************** -- 2.39.2