From 7597c66c08bac938fedcacc1f760186bc077b3f1 Mon Sep 17 00:00:00 2001 From: westcott Date: Sun, 26 Jul 2009 23:56:29 +0000 Subject: [PATCH] fixed bug with aligner in database class, worked on chimeras --- Mothur.xcodeproj/project.pbxproj | 1 + chimera.h | 27 ++ chimeraseqscommand.cpp | 16 +- chimeraseqscommand.h | 2 +- database.cpp | 21 +- decalc.cpp | 403 ++++++++++++++++++++++ decalc.h | 20 +- mothur.cpp | 4 +- pintail.cpp | 557 +++++++++---------------------- pintail.h | 18 +- 10 files changed, 641 insertions(+), 428 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index f1b5841..ab3240e 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -1238,6 +1238,7 @@ ); PREBINDING = NO; SDKROOT = "$(DEVELOPER_SDK_DIR)/MacOSX10.5.sdk"; + SYMROOT = /Users/johnwestcott/Desktop; WARNING_CFLAGS = ""; }; name = Release; diff --git a/chimera.h b/chimera.h index 2dc2989..106d24c 100644 --- a/chimera.h +++ b/chimera.h @@ -62,6 +62,32 @@ class Chimera { } + virtual void setMask(string filename) { + try { + + if (filename == "") { + //default is from wigeon 236627 EU009184.1 Shigella dysenteriae str. FBD013 + seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................"; + }else{ + ifstream infile; + openInputFile(filename, infile); + + while (!infile.eof()) { + Sequence temp(infile); + seqMask = temp.getAligned(); + + gobble(infile); + } + + infile.close(); + } + } + catch(exception& e) { + errorOut(e, "Chimera", "setMask"); + exit(1); + } + } + //pure functions virtual void getChimeras() = 0; virtual void print(ostream&) = 0; @@ -70,6 +96,7 @@ class Chimera { bool filter, correction; int processors, window, increment; + string seqMask; }; diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index aaed3ba..ba980a5 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -22,7 +22,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -50,7 +50,12 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ quanfile = validParameter.validFile(parameters, "quantile", true); if (quanfile == "not open") { abort = true; } - else if (quanfile == "not found") { quanfile = ""; } + else if (quanfile == "not found") { quanfile = ""; } + + maskfile = validParameter.validFile(parameters, "mask", true); + if (maskfile == "not open") { abort = true; } + else if (maskfile == "not found") { maskfile = ""; } + string temp; @@ -86,11 +91,12 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ void ChimeraSeqsCommand::help(){ try { mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n"); - mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n"); + mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask and method. fasta is required.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n"); mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n"); + mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n"); mothurOut("The chimera.seqs command should be in the following format: \n"); mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n"); @@ -122,6 +128,10 @@ int ChimeraSeqsCommand::execute(){ //saves time to avoid generating it if (quanfile != "") { chimera->setQuantiles(quanfile); } else { chimera->setQuantiles(""); } + + if (maskfile == "") { mothurOut("You have not provided a mask, so I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); } + chimera->setMask(maskfile); + }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 7721ef8..85d5732 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -28,7 +28,7 @@ public: private: bool abort; - string method, fastafile, templatefile, consfile, quanfile; + string method, fastafile, templatefile, consfile, quanfile, maskfile; bool filter, correction; int processors, midpoint, averageLeft, averageRight, window, iters, increment; Chimera* chimera; diff --git a/database.cpp b/database.cpp index 534747a..9784ef9 100644 --- a/database.cpp +++ b/database.cpp @@ -24,12 +24,12 @@ Database::Database(string fastaFileName){ // This assumes that the template dat mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); //all of this is elsewhere already! - numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of - fastaFile.seekg(0); // sequences + //numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of + //fastaFile.seekg(0); // sequences - templateSequences.resize(numSeqs); + //templateSequences.resize(numSeqs); - string seqName, sequence; + /*string seqName, sequence; for(int i=0;i> seqName; seqName = seqName.substr(1); @@ -52,8 +52,21 @@ Database::Database(string fastaFileName){ // This assumes that the template dat if (templateSequences[i].getUnaligned().length() > longest) { longest = templateSequences[i].getUnaligned().length(); } fastaFile.putback(letter); + }*/ + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + + templateSequences.push_back(temp); + + //save longest base + if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } + + gobble(fastaFile); } + numSeqs = templateSequences.size(); + fastaFile.close(); //all of this is elsewhere already! diff --git a/decalc.cpp b/decalc.cpp index 7e4a494..d57f435 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -9,3 +9,406 @@ #include "decalc.h" +//*************************************************************************************************************** +void DeCalculator::setMask(string m) { + try { + seqMask = m; + + //whereever there is a base in the mask, save that value is query and subject + for (int i = 0; i < seqMask.length(); i++) { + if (isalpha(seqMask[i])) { + h.insert(i); + } + } + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "setMask"); + exit(1); + } +} +//*************************************************************************************************************** +void DeCalculator::runMask(Sequence* seq) { + try{ + + string q = seq->getAligned(); + string tempQuery = ""; + + //whereever there is a base in the mask, save that value is query and subject + set::iterator setit; + for ( setit=h.begin() ; setit != h.end(); setit++ ) { + tempQuery += q[*setit]; + } + + //save masked values + seq->setAligned(tempQuery); + seq->setUnaligned(tempQuery); + } + catch(exception& e) { + errorOut(e, "DeCalculator", "runMask"); + exit(1); + } +} +//*************************************************************************************************************** +//num is query's spot in querySeqs +void DeCalculator::trimSeqs(Sequence* query, Sequence subject, map& trim) { + try { + + string q = query->getAligned(); + string s = subject.getAligned(); + + int front = 0; + for (int i = 0; i < q.length(); i++) { + if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; } + } + + int back = 0; + for (int i = q.length(); i >= 0; i--) { + if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; } + } + + trim[front] = back; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "trimSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +//find the window breaks for each sequence - this is so you can move ahead by bases. +vector DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) { + try { + + vector win; + + int cutoff = back - front; //back - front + + //if window is set to default + if (size == 0) { if (cutoff > 1200) { size = 300; } + else{ size = (cutoff / 4); } } + else if (size > (cutoff / 4)) { + mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); + size = (cutoff / 4); + } + + string seq = query->getAligned().substr(front, cutoff); + + //count bases + int numBases = 0; + for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } } + + //save start of seq + win.push_back(front); + + //move ahead increment bases at a time until all bases are in a window + int countBases = 0; + int totalBases = 0; //used to eliminate window of blanks at end of sequence + + seq = query->getAligned(); + for (int m = front; m < (back - size) ; m++) { + + //count number of bases you see + if (isalpha(seq[m])) { countBases++; totalBases++; } + + //if you have seen enough bases to make a new window + if (countBases >= increment) { + win.push_back(m); //save spot in alignment + countBases = 0; //reset bases you've seen in this window + } + + //no need to continue if all your bases are in a window + if (totalBases == numBases) { break; } + } + + return win; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findWindows"); + exit(1); + } +} + +//*************************************************************************************************************** +vector DeCalculator::calcObserved(Sequence* query, Sequence subject, vector window, int size) { + try { + + vector temp; +//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl; + for (int m = 0; m < window.size(); m++) { + + string seqFrag = query->getAligned().substr(window[m], size); + string seqFragsub = subject.getAligned().substr(window[m], size); + //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl; + int diff = 0; + for (int b = 0; b < seqFrag.length(); b++) { + + if (seqFrag[b] != seqFragsub[b]) { diff++; } + } + + //percentage of mismatched bases + float dist; + dist = diff / (float) seqFrag.length() * 100; + + temp.push_back(dist); + } + + return temp; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "calcObserved"); + exit(1); + } +} +//*************************************************************************************************************** +float DeCalculator::calcDist(Sequence* query, Sequence subject, int front, int back) { + try { + + //so you only look at the trimmed part of the sequence + int cutoff = back - front; + + //from first startpoint with length back-front + string seqFrag = query->getAligned().substr(front, cutoff); + string seqFragsub = subject.getAligned().substr(front, cutoff); + + int diff = 0; + for (int b = 0; b < seqFrag.length(); b++) { + if (seqFrag[b] != seqFragsub[b]) { diff++; } + } + + //percentage of mismatched bases + float dist = diff / (float) seqFrag.length() * 100; + + return dist; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "calcDist"); + exit(1); + } +} + +//*************************************************************************************************************** +vector DeCalculator::calcExpected(vector qav, float coef) { + try { + + //for each window + vector queryExpected; + + for (int m = 0; m < qav.size(); m++) { + + float expected = qav[m] * coef; + + queryExpected.push_back(expected); + } + + return queryExpected; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "calcExpected"); + exit(1); + } +} +//*************************************************************************************************************** +float DeCalculator::calcDE(vector obs, vector exp) { + try { + + //for each window + float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 + for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } + + float de = sqrt((sum / (obs.size() - 1))); + + return de; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "calcDE"); + exit(1); + } +} + +//*************************************************************************************************************** + +vector DeCalculator::calcFreq(vector seqs, string filename) { + try { + + vector prob; + string freqfile = getRootName(filename) + "prob"; + ofstream outFreq; + + openOutputFile(freqfile, outFreq); + + //at each position in the sequence + for (int i = 0; i < seqs[0]->getAligned().length(); i++) { + + vector freq; freq.resize(4,0); + int gaps = 0; + + //find the frequency of each nucleotide + for (int j = 0; j < seqs.size(); j++) { + + char value = seqs[j]->getAligned()[i]; + + if(toupper(value) == 'A') { freq[0]++; } + else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; } + else if(toupper(value) == 'G') { freq[2]++; } + else if(toupper(value) == 'C') { freq[3]++; } + else { gaps++; } + } + + //find base with highest frequency + int highest = 0; + for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } + + float highFreq; + //subtract gaps to "ignore them" + if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } + else { highFreq = highest / (float) (seqs.size() - gaps); } + + float Pi; + Pi = (highFreq - 0.25) / 0.75; + + //cannot have probability less than 0. + if (Pi < 0) { Pi = 0.0; } + + //saves this for later + outFreq << i+1 << '\t' << Pi << endl; + + prob.push_back(Pi); + } + + outFreq.close(); + + return prob; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "calcFreq"); + exit(1); + } +} +//*************************************************************************************************************** +vector DeCalculator::findQav(vector window, int size, vector probabilityProfile) { + try { + vector averages; + + //for each window find average + for (int m = 0; m < window.size(); m++) { + + float average = 0.0; + + //while you are in the window for this sequence + int count = 0; + for (int j = window[m]; j < (window[m]+size); j++) { + + //is this a spot that is included in the mask + if (h.count(j) > 0) { + average += probabilityProfile[j]; + count++; + } + } + + average = average / count; + + //save this windows average + averages.push_back(average); + } + + return averages; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findQav"); + exit(1); + } +} + +//*************************************************************************************************************** +vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { + try { + vector< vector > quan; + + //percentage of mismatched pairs 1 to 100 + quan.resize(100); + + + //for each sequence + for(int i = start; i < end; i++){ + + mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine(); + Sequence* query = seqs[i]; + + //compare to every other sequence in template + for(int j = 0; j < i; j++){ + + Sequence subject = *(seqs[j]); + + map trim; + map::iterator it; + + trimSeqs(query, subject, trim); + + it = trim.begin(); + int front = it->first; int back = it->second; + + //reset window for each new comparison + windowSizesTemplate[i] = window; + + vector win = findWindows(query, front, back, windowSizesTemplate[i], increment); + + vector obsi = calcObserved(query, subject, win, windowSizesTemplate[i]); + + vector q = findQav(win, windowSizesTemplate[i], probProfile); + + float alpha = getCoef(obsi, q); + + vector exp = calcExpected(q, alpha); + + float de = calcDE(obsi, exp); + + float dist = calcDist(query, subject, front, back); + + dist = ceil(dist); + + //dist-1 because vector indexes start at 0. + quan[dist-1].push_back(de); + + } + } + + return quan; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findQav"); + exit(1); + } +} + +//*************************************************************************************************************** +float DeCalculator::getCoef(vector obs, vector qav) { + try { + + //find average prob for this seqs windows + float probAverage = 0.0; + for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; } + probAverage = probAverage / (float) qav.size(); + + //find observed average + float obsAverage = 0.0; + for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; } + obsAverage = obsAverage / (float) obs.size(); +//cout << "sum ai / m = " << probAverage << endl; +//cout << "sum oi / m = " << obsAverage << endl; + float coef = obsAverage / probAverage; + + return coef; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "getCoef"); + exit(1); + } +} +//*************************************************************************************************************** + + diff --git a/decalc.h b/decalc.h index 5818539..dbdf316 100644 --- a/decalc.h +++ b/decalc.h @@ -23,22 +23,26 @@ class DeCalculator { public: - - void trimSeqs(Sequence*, Sequence, map&); - vector readFreq(); - vector calcFreq(vector); - vector findPairs(int, int); - vector findWindows(Sequence*, int, int, int&); + DeCalculator() {}; + ~DeCalculator() {}; + + void setMask(string m); + void runMask(Sequence*); + void trimSeqs(Sequence*, Sequence, map&); + vector calcFreq(vector, string); + vector findWindows(Sequence*, int, int, int&, int); vector calcObserved(Sequence*, Sequence, vector, int); vector calcExpected(vector, float); - vector findQav(vector, int); + vector findQav(vector, int, vector); float calcDE(vector, vector); float calcDist(Sequence*, Sequence, int, int); float getCoef(vector, vector); + vector< vector > getQuantiles(vector, vector, int, vector, int, int, int); private: - + string seqMask; + set h; }; diff --git a/mothur.cpp b/mothur.cpp index 0cd4faf..9208a25 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -18,14 +18,13 @@ GlobalData* GlobalData::_uniqueInstance = 0; int main(int argc, char *argv[]){ try { - system("clear"); - //remove old logfile string logFileName = "mothur.logFile"; remove(logFileName.c_str()); //version #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + system("clear"); #if defined (__APPLE__) || (__MACH__) mothurOutJustToLog("Mac version"); mothurOutEndLine(); mothurOutEndLine(); @@ -35,6 +34,7 @@ int main(int argc, char *argv[]){ #endif #else + system("CLS"); mothurOutJustToLog("Windows version"); mothurOutEndLine(); mothurOutEndLine(); #endif diff --git a/pintail.cpp b/pintail.cpp index a204294..bcdc679 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -39,6 +39,9 @@ void Pintail::print(ostream& out) { else { chimera = "No"; } out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl; + if (chimera == "Yes") { + mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine(); + } out << "Observed\t"; for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; } @@ -80,10 +83,11 @@ void Pintail::getChimeras() { windowSizes.resize(numSeqs, window); windowSizesTemplate.resize(templateSeqs.size(), window); windowsForeachQuery.resize(numSeqs); + h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch //break up file if needed - int linesPerProcess = processors / numSeqs; + int linesPerProcess = numSeqs / processors ; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //find breakup of sequences for all times we will Parallelize @@ -113,14 +117,28 @@ void Pintail::getChimeras() { #endif distcalculator = new ignoreGaps(); - + decalc = new DeCalculator(); + + decalc->setMask(seqMask); + + //mask querys + for (int i = 0; i < querySeqs.size(); i++) { + decalc->runMask(querySeqs[i]); + } + + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); + } + +for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl; } if (processors == 1) { mothurOut("Finding closest sequence in template to each sequence... "); cout.flush(); bestfit = findPairs(lines[0]->start, lines[0]->end); //ex.align matches from wigeon -/*for (int m = 0; m < templateSeqs.size(); m++) { +for (int m = 0; m < templateSeqs.size(); m++) { if (templateSeqs[m]->getName() == "159481") { bestfit[17] = *(templateSeqs[m]); } if (templateSeqs[m]->getName() == "100137") { bestfit[16] = *(templateSeqs[m]); } if (templateSeqs[m]->getName() == "112956") { bestfit[15] = *(templateSeqs[m]); } @@ -141,19 +159,19 @@ void Pintail::getChimeras() { if (templateSeqs[m]->getName() == "141312") { bestfit[0] = *(templateSeqs[m]); } -}*/ +} for (int j = 0; j < bestfit.size(); j++) { //chops off beginning and end of sequences so they both start and end with a base - trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); + decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); } mothurOut("Done."); mothurOutEndLine(); mothurOut("Finding window breaks... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { it = trimmed[i].begin(); -cout << "trimmed = " << it->first << '\t' << it->second << endl; - vector win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); +//cout << "trimmed = " << it->first << '\t' << it->second << endl; + vector win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); windowsForeachQuery[i] = win; } mothurOut("Done."); mothurOutEndLine(); @@ -164,7 +182,7 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the quantiles to a .prob file so that you can input them using the conservation parameter next time you run this command. Providing the .prob file will dramatically improve speed. "); cout.flush(); - probabilityProfile = calcFreq(templateSeqs); + probabilityProfile = decalc->calcFreq(templateSeqs, templateFile); mothurOut("Done."); mothurOutEndLine(); }else { probabilityProfile = readFreq(); } @@ -176,8 +194,8 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; mothurOut("Calculating observed distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - cout << querySeqs[i]->getName() << '\t' << bestfit[i].getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl; - vector obsi = calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); + //cout << querySeqs[i]->getName() << '\t' << bestfit[i].getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl; + vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); obsDistance[i] = obsi; } mothurOut("Done."); mothurOutEndLine(); @@ -185,23 +203,31 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; mothurOut("Finding variability... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector q = findQav(windowsForeachQuery[i], windowSizes[i]); + vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); + Qav[i] = q; +//cout << i+1 << endl; +//for (int j = 0; j < Qav[i].size(); j++) { + //cout << Qav[i][j] << '\t'; +//} +//cout << endl << endl; + } mothurOut("Done."); mothurOutEndLine(); mothurOut("Calculating alpha... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - float alpha = getCoef(obsDistance[i], Qav[i]); - seqCoef.push_back(alpha); + float alpha = decalc->getCoef(obsDistance[i], Qav[i]); +//cout << i+1 << "\tcoef = " << alpha << endl; + seqCoef[i] = alpha; } mothurOut("Done."); mothurOutEndLine(); mothurOut("Calculating expected distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector exp = calcExpected(Qav[i], seqCoef[i]); + vector exp = decalc->calcExpected(Qav[i], seqCoef[i]); expectedDistance[i] = exp; } mothurOut("Done."); mothurOutEndLine(); @@ -209,11 +235,11 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; mothurOut("Finding deviation... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - float de = calcDE(obsDistance[i], expectedDistance[i]); + float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); DE[i] = de; it = trimmed[i].begin(); - float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); deviation[i] = dist; } mothurOut("Done."); mothurOutEndLine(); @@ -230,7 +256,7 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { - quantiles = getQuantiles(0, templateSeqs.size()); + quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } ofstream out4; @@ -281,38 +307,13 @@ cout << "trimmed = " << it->first << '\t' << it->second << endl; for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } delete distcalculator; + delete decalc; } catch(exception& e) { errorOut(e, "Pintail", "getChimeras"); exit(1); } } -//*************************************************************************************************************** -//num is query's spot in querySeqs -void Pintail::trimSeqs(Sequence* query, Sequence subject, map& trim) { - try { - - string q = query->getAligned(); - string s = subject.getAligned(); - - int front = 0; - for (int i = 0; i < q.length(); i++) { - if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; } - } - - int back = 0; - for (int i = q.length(); i >= 0; i--) { - if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; } - } - - trim[front] = back; - - } - catch(exception& e) { - errorOut(e, "Pintail", "trimSeqs"); - exit(1); - } -} //*************************************************************************************************************** @@ -391,13 +392,14 @@ vector< vector > Pintail::readQuantiles() { vector Pintail::findPairs(int start, int end) { try { - vector seqsMatches; seqsMatches.resize(end-start); + vector seqsMatches; for(int i = start; i < end; i++){ float smallest = 10000.0; Sequence query = *(querySeqs[i]); - + Sequence match; + for(int j = 0; j < templateSeqs.size(); j++){ Sequence temp = *(templateSeqs[j]); @@ -406,10 +408,12 @@ vector Pintail::findPairs(int start, int end) { float dist = distcalculator->getDist(); if (dist < smallest) { - seqsMatches[i] = *(templateSeqs[j]); + match = *(templateSeqs[j]); smallest = dist; } } + + seqsMatches.push_back(match); } return seqsMatches; @@ -421,329 +425,6 @@ vector Pintail::findPairs(int start, int end) { } } -//*************************************************************************************************************** -//find the window breaks for each sequence - this is so you can move ahead by bases. -vector Pintail::findWindows(Sequence* query, int front, int back, int& size) { - try { - - vector win; - - int cutoff = back - front; //back - front - - //if window is set to default - if (size == 0) { if (cutoff > 1200) { size = 300; } - else{ size = (cutoff / 4); } } - else if (size > (cutoff / 4)) { - mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); - size = (cutoff / 4); - } - - string seq = query->getAligned().substr(front, cutoff); - - //count bases - int numBases = 0; - for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } } - - //save start of seq - win.push_back(front); - - //move ahead increment bases at a time until all bases are in a window - int countBases = 0; - int totalBases = 0; //used to eliminate window of blanks at end of sequence - - seq = query->getAligned(); - for (int m = front; m < (back - size) ; m++) { - - //count number of bases you see - if (isalpha(seq[m])) { countBases++; totalBases++; } - - //if you have seen enough bases to make a new window - if (countBases >= increment) { - win.push_back(m); //save spot in alignment - countBases = 0; //reset bases you've seen in this window - } - - //no need to continue if all your bases are in a window - if (totalBases == numBases) { break; } - } - - return win; - - } - catch(exception& e) { - errorOut(e, "Pintail", "findWindows"); - exit(1); - } -} - -//*************************************************************************************************************** -vector Pintail::calcObserved(Sequence* query, Sequence subject, vector window, int size) { - try { - - vector temp; -//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl; - for (int m = 0; m < window.size(); m++) { - - string seqFrag = query->getAligned().substr(window[m], size); - string seqFragsub = subject.getAligned().substr(window[m], size); - //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl; - int diff = 0; - for (int b = 0; b < seqFrag.length(); b++) { - if (seqFrag[b] != seqFragsub[b]) { diff++; } - } - - //percentage of mismatched bases - float dist; - dist = diff / (float) seqFrag.length() * 100; - - temp.push_back(dist); - } - - return temp; - } - catch(exception& e) { - errorOut(e, "Pintail", "calcObserved"); - exit(1); - } -} -//*************************************************************************************************************** -float Pintail::calcDist(Sequence* query, Sequence subject, int front, int back) { - try { - - //so you only look at the trimmed part of the sequence - int cutoff = back - front; - - //from first startpoint with length back-front - string seqFrag = query->getAligned().substr(front, cutoff); - string seqFragsub = subject.getAligned().substr(front, cutoff); - - int diff = 0; - for (int b = 0; b < seqFrag.length(); b++) { - if (seqFrag[b] != seqFragsub[b]) { diff++; } - } - - //percentage of mismatched bases - float dist = diff / (float) seqFrag.length() * 100; - - return dist; - } - catch(exception& e) { - errorOut(e, "Pintail", "calcDist"); - exit(1); - } -} - -//*************************************************************************************************************** -vector Pintail::calcExpected(vector qav, float coef) { - try { - - //for each window - vector queryExpected; - - for (int m = 0; m < qav.size(); m++) { - - float expected = qav[m] * coef; - - queryExpected.push_back(expected); - } - - return queryExpected; - - } - catch(exception& e) { - errorOut(e, "Pintail", "calcExpected"); - exit(1); - } -} -//*************************************************************************************************************** -float Pintail::calcDE(vector obs, vector exp) { - try { - - //for each window - float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 - for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } - - float de = sqrt((sum / (obs.size() - 1))); - - return de; - } - catch(exception& e) { - errorOut(e, "Pintail", "calcDE"); - exit(1); - } -} - -//*************************************************************************************************************** - -vector Pintail::calcFreq(vector seqs) { - try { - - vector prob; - string freqfile = getRootName(templateFile) + "prob"; - ofstream outFreq; - - openOutputFile(freqfile, outFreq); - - //at each position in the sequence - for (int i = 0; i < seqs[0]->getAligned().length(); i++) { - - vector freq; freq.resize(4,0); - int gaps = 0; - - //find the frequency of each nucleotide - for (int j = 0; j < seqs.size(); j++) { - - char value = seqs[j]->getAligned()[i]; - - if(toupper(value) == 'A') { freq[0]++; } - else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; } - else if(toupper(value) == 'G') { freq[2]++; } - else if(toupper(value) == 'C') { freq[3]++; } - else { gaps++; } - } - - //find base with highest frequency - int highest = 0; - for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } - - float highFreq; - //subtract gaps to "ignore them" - if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } - else { highFreq = highest / (float) (seqs.size() - gaps); } - - float Pi; - Pi = (highFreq - 0.25) / 0.75; - - //cannot have probability less than 0. - if (Pi < 0) { Pi = 0.0; } - - //saves this for later - outFreq << i+1 << '\t' << Pi << endl; - - prob.push_back(Pi); - } - - outFreq.close(); - - return prob; - - } - catch(exception& e) { - errorOut(e, "Pintail", "calcFreq"); - exit(1); - } -} -//*************************************************************************************************************** -vector Pintail::findQav(vector window, int size) { - try { - vector averages; - - //for each window find average - for (int m = 0; m < window.size(); m++) { - - float average = 0.0; - - //while you are in the window for this sequence - for (int j = window[m]; j < (window[m]+size); j++) { average += probabilityProfile[j]; } - - average = average / size; - - //save this windows average - averages.push_back(average); - } - - return averages; - } - catch(exception& e) { - errorOut(e, "Pintail", "findQav"); - exit(1); - } -} - -//*************************************************************************************************************** -vector< vector > Pintail::getQuantiles(int start, int end) { - try { - vector< vector > quan; - - //percentage of mismatched pairs 1 to 100 - quan.resize(100); - - - //for each sequence - for(int i = start; i < end; i++){ - - mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine(); - Sequence* query = templateSeqs[i]; - - //compare to every other sequence in template - for(int j = 0; j < i; j++){ - - Sequence subject = *(templateSeqs[j]); - - map trim; - trimSeqs(query, subject, trim); - - it = trim.begin(); - int front = it->first; int back = it->second; - - //reset window for each new comparison - windowSizesTemplate[i] = window; - - vector win = findWindows(query, front, back, windowSizesTemplate[i]); - - vector obsi = calcObserved(query, subject, win, windowSizesTemplate[i]); - - vector q = findQav(win, windowSizesTemplate[i]); - - float alpha = getCoef(obsi, q); - - vector exp = calcExpected(q, alpha); - - float de = calcDE(obsi, exp); - - float dist = calcDist(query, subject, front, back); - - dist = ceil(dist); - - //dist-1 because vector indexes start at 0. - quan[dist-1].push_back(de); - - } - } - - return quan; - - } - catch(exception& e) { - errorOut(e, "Pintail", "findQav"); - exit(1); - } -} - -//*************************************************************************************************************** -float Pintail::getCoef(vector obs, vector qav) { - try { - - //find average prob for this seqs windows - float probAverage = 0.0; - for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; } - probAverage = probAverage / (float) qav.size(); - - //find observed average - float obsAverage = 0.0; - for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; } - obsAverage = obsAverage / (float) obs.size(); - - - float coef = obsAverage / probAverage; - - return coef; - } - catch(exception& e) { - errorOut(e, "Pintail", "getCoef"); - exit(1); - } -} /**************************************************************************************************/ void Pintail::createProcessesSpots() { @@ -751,7 +432,6 @@ void Pintail::createProcessesSpots() { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; vector processIDS; - vector< vector > win; win.resize(querySeqs.size()); //loop through and create all the processes you want while (process != processors) { @@ -763,25 +443,56 @@ void Pintail::createProcessesSpots() { }else if (pid == 0){ mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - vector tempbest; - tempbest = findPairs(lines[process]->start, lines[process]->end); - int count = 0; - for (int i = lines[process]->start; i < lines[process]->end; i++) { - bestfit[i] = tempbest[count]; - + bestfit = findPairs(lines[process]->start, lines[process]->end); + mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); + + int count = lines[process]->start; + for (int j = 0; j < bestfit.size(); j++) { + //chops off beginning and end of sequences so they both start and end with a base - trimSeqs(querySeqs[i], bestfit[i], trimmed[i]); + map trim; + decalc->trimSeqs(querySeqs[count], bestfit[j], trim); + trimmed[count] = trim; + count++; } - mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - + mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); for (int i = lines[process]->start; i < lines[process]->end; i++) { - vector temp = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); - win[i] = temp; + it = trimmed[i].begin(); + windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); } mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); + //write out data to file so parent can read it + ofstream out; + string s = toString(pid) + ".temp"; + openOutputFile(s, out); + + //output range and size + out << bestfit.size() << endl; + + //output pairs + for (int i = 0; i < bestfit.size(); i++) { + out << ">" << bestfit[i].getName() << endl << bestfit[i].getAligned() << endl; + } + + //output windowsForeachQuery + for (int i = 0; i < windowsForeachQuery.size(); i++) { + out << windowsForeachQuery[i].size() << '\t'; + for (int j = 0; j < windowsForeachQuery[i].size(); j++) { + out << windowsForeachQuery[i][j] << '\t'; + } + out << endl; + } + + //output windowSizes + for (int i = 0; i < windowSizes.size(); i++) { + out << windowSizes[i] << '\t'; + } + out << endl; + out.close(); + exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -792,17 +503,67 @@ void Pintail::createProcessesSpots() { wait(&temp); } - windowsForeachQuery = win; + //get data created by processes + for (int i=0;i> size; gobble(in); + + //get pairs + int count = lines[i]->start; + for (int m = 0; m < size; m++) { + Sequence temp(in); + bestfit[count] = temp; + + count++; + gobble(in); + } + + gobble(in); + + count = lines[i]->start; + for (int m = 0; m < size; m++) { + int num; + in >> num; + + vector win; int w; + for (int j = 0; j < num; j++) { + in >> w; + win.push_back(w); + } + + windowsForeachQuery[count] = win; + count++; + gobble(in); + } + + gobble(in); + count = lines[i]->start; + for (int i = 0; i < size; i++) { + int num; + in >> num; + + windowSizes[count] = num; + count++; + } + + in.close(); + } + + #else bestfit = findPairs(lines[0]->start, lines[0]->end); for (int j = 0; j < bestfit.size(); j++) { //chops off beginning and end of sequences so they both start and end with a base - trimSeqs(querySeqs[j], bestfit[j], j); + decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); } for (int i = lines[0]->start; i < lines[0]->end; i++) { it = trimmed[i].begin(); - map win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]); + map win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); windows[i] = win; } @@ -841,25 +602,25 @@ void Pintail::createProcesses() { mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); for (int i = lines[process]->start; i < lines[process]->end; i++) { - vector obsi = calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); + vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); obs[i] = obsi; //calc Qav - vector q = findQav(windowsForeachQuery[i], windowSizes[i]); + vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); //get alpha - float alpha = getCoef(obsDistance[i], q); + float alpha = decalc->getCoef(obsDistance[i], q); //find expected - vector exp = calcExpected(q, alpha); + vector exp = decalc->calcExpected(q, alpha); expectedDistance[i] = exp; //get de and deviation - float dei = calcDE(obsi, exp); + float dei = decalc->calcDE(obsi, exp); de[i] = dei; it = trimmed[i].begin(); - float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); dev[i] = dist; } mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); @@ -882,7 +643,7 @@ void Pintail::createProcesses() { #else mothurOut("Calculating observed distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]); + vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]); obsDistance[i] = obsi; } mothurOut("Done."); mothurOutEndLine(); @@ -891,7 +652,7 @@ void Pintail::createProcesses() { mothurOut("Finding variability... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector q = findQav(windows[i], windowSizes[i]); + vector q = decalc->findQav(windows[i], windowSizes[i], probabilityProfile, h[i]); Qav[i] = q; } mothurOut("Done."); mothurOutEndLine(); @@ -900,7 +661,7 @@ void Pintail::createProcesses() { mothurOut("Calculating alpha... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - float alpha = getCoef(obsDistance[i], Qav[i]); + float alpha = decalc->getCoef(obsDistance[i], Qav[i]); seqCoef.push_back(alpha); } mothurOut("Done."); mothurOutEndLine(); @@ -909,7 +670,7 @@ void Pintail::createProcesses() { mothurOut("Calculating expected distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector exp = calcExpected(Qav[i], seqCoef[i]); + vector exp = decalc->calcExpected(Qav[i], seqCoef[i]); expectedDistance[i] = exp; } mothurOut("Done."); mothurOutEndLine(); @@ -918,11 +679,11 @@ void Pintail::createProcesses() { mothurOut("Finding deviation... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - float de = calcDE(obsDistance[i], expectedDistance[i]); + float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); DE[i] = de; it = trimmed[i].begin(); - float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); + float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); deviation[i] = dist; } mothurOut("Done."); mothurOutEndLine(); @@ -954,7 +715,7 @@ void Pintail::createProcessesQuan() { process++; }else if (pid == 0){ - vector< vector > q = getQuantiles(templateLines[process]->start, templateLines[process]->end); + vector< vector > q = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); for (int i = 0; i < q.size(); i++) { //put all values of q[i] into quan[i] @@ -979,7 +740,7 @@ void Pintail::createProcessesQuan() { quantiles = quan; #else - quantiles = getQuantiles(0, templateSeqs.size()); + quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); #endif } catch(exception& e) { diff --git a/pintail.h b/pintail.h index 8c0dce4..8afc754 100644 --- a/pintail.h +++ b/pintail.h @@ -12,6 +12,7 @@ #include "chimera.h" #include "dist.h" +#include "decalc.h" //This class was created using the algorythms described in the // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper @@ -42,9 +43,11 @@ class Pintail : public Chimera { }; Dist* distcalculator; + DeCalculator* decalc; int iters; string fastafile, templateFile, consfile, quanfile; + vector lines; vector templateLines; vector querySeqs; @@ -69,22 +72,13 @@ class Pintail : public Chimera { vector DE; //DE[0] is the deviaation for queryseqs[0]... vector probabilityProfile; vector< vector > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... + vector< set > h; + - void trimSeqs(Sequence*, Sequence, map&); vector readFreq(); vector< vector > readQuantiles(); - vector< vector > getQuantiles(int, int); - vector calcFreq(vector); - vector findPairs(int, int); - vector findWindows(Sequence*, int, int, int&); - vector calcObserved(Sequence*, Sequence, vector, int); - vector calcExpected(vector, float); - vector findQav(vector, int); - float calcDE(vector, vector); - float calcDist(Sequence*, Sequence, int, int); - float getCoef(vector, vector); - + void createProcessesSpots(); void createProcesses(); void createProcessesQuan(); -- 2.39.2