X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimeraperseuscommand.h;h=b6d4fc9323f659abe52cd70a1e3598a84d9db936;hp=01f5768a0c070c70aa6c7264b6d482b3636d7148;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=43ed0accfbc2852849e104ff7eccdd2c42acd4ec diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index 01f5768..b6d4fc9 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -16,7 +16,9 @@ #include "mothur.h" #include "command.hpp" #include "sequenceparser.h" +#include "sequencecountparser.h" #include "myPerseus.h" +#include "counttable.h" /***********************************************************/ class ChimeraPerseusCommand : public Command { @@ -28,8 +30,10 @@ public: vector setParameters(); string getCommandName() { return "chimera.perseus"; } string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); - string getCitation() { return "http://www.mothur.org/wiki/Chimera.perseus\n"; } + string getOutputPattern(string); + string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection. Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; } string getDescription() { return "detect chimeric sequences"; } int execute(); @@ -42,10 +46,12 @@ private: linePair(int i, int j) : start(i), end(j) {} }; - bool abort; - string fastafile, groupfile, outputDir, namefile; + bool abort, hasName, hasCount, dups; + string fastafile, groupfile, countfile, outputDir, namefile; int processors, alignLength; double cutoff, alpha, beta; + SequenceParser* parser; + SequenceCountParser* cparser; vector outputNames; vector fastaFileNames; @@ -55,10 +61,12 @@ private: string getNamesFile(string&); int driver(string, vector&, string, int&); vector readFiles(string, string); - vector loadSequences(SequenceParser&, string); - int deconvoluteResults(SequenceParser&, string, string); - int driverGroups(SequenceParser&, string, string, int, int, vector); - int createProcessesGroups(SequenceParser&, string, string, vector, string, string, string); + vector readFiles(string inputFile, CountTable* ct); + vector loadSequences(string); + int deconvoluteResults(map&, string, string); + int driverGroups(string, string, string, int, int, vector); + int createProcessesGroups(string, string, string, vector, string, string, string); + string removeNs(string); }; /**************************************************************************************************/ @@ -71,15 +79,17 @@ struct perseusData { string groupfile; string outputFName; string accnos; + string countlist; MothurOut* m; int start; int end; + bool hasName, hasCount, dups; int threadID, count, numChimeras; double alpha, beta, cutoff; vector groups; perseusData(){} - perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + perseusData(bool dps, bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, string ctlist, vector gr, MothurOut* mout, int st, int en, int tid) { alpha = a; beta = b; cutoff = c; @@ -87,12 +97,16 @@ struct perseusData { namefile = n; groupfile = g; outputFName = o; + countlist = ctlist; accnos = ac; m = mout; start = st; end = en; threadID = tid; groups = gr; + hasName = hn; + hasCount = hc; + dups = dps; count = 0; numChimeras = 0; } @@ -113,38 +127,80 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } - + SequenceCountParser* cparser; + if (pDataArray->hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(pDataArray->namefile, true); + cparser = new SequenceCountParser(pDataArray->fastafile, *ct); + delete ct; + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } + int totalSeqs = 0; int numChimeras = 0; + + ofstream outCountList; + if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); } - for (int i = pDataArray->start; i < pDataArray->end; i++) { + for (int u = pDataArray->start; u < pDataArray->end; u++) { - int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } - pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[u] + "..."); pDataArray->m->mothurOutEndLine(); //vector sequences = loadSequences(parser, groups[i]); - same function below //////////////////////////////////////////////////////////////////////////////////////// - vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); - map nameMap = parser->getNameMap(pDataArray->groups[i]); - map::iterator it; - - vector sequences; bool error = false; - - for (int j = 0; j < thisGroupsSeqs.size(); j++) { - - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } - - it = nameMap.find(thisGroupsSeqs[j].getName()); - if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - else { - int num = pDataArray->m->getNumNames(it->second); - sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num)); - } - } + int alignLength = 0; + vector sequences; + if (pDataArray->hasCount) { + vector thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[u]); + map counts = cparser->getCountTable(pDataArray->groups[u]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = counts.find(thisGroupsSeqs[i].getName()); + if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + string newSeq = ""; + string tempSeq = thisGroupsSeqs[i].getUnaligned(); + for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } } + thisGroupsSeqs[i].setAligned(newSeq); + + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + }else{ + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[u]); + map nameMap = parser->getNameMap(pDataArray->groups[u]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + int num = pDataArray->m->getNumNames(it->second); + string newSeq = ""; + string tempSeq = thisGroupsSeqs[i].getUnaligned(); + for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } } + thisGroupsSeqs[i].setAligned(newSeq); + + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + + } + if (error) { pDataArray->m->control_pressed = true; } @@ -152,12 +208,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ sort(sequences.rbegin(), sequences.rend()); //////////////////////////////////////////////////////////////////////////////////////// - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below //////////////////////////////////////////////////////////////////////////////////////// - string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i]; - string accnosFileName = pDataArray->accnos+pDataArray->groups[i]; + string chimeraFileName = pDataArray->outputFName+pDataArray->groups[u]; + string accnosFileName = pDataArray->accnos+pDataArray->groups[u]; vector > correctModel(4); //could be an option in the future to input own model matrix for(int j=0;j<4;j++){ correctModel[j].resize(4); } @@ -183,7 +239,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ } int numSeqs = sequences.size(); - int alignLength = sequences[0].sequence.size(); + //int alignLength = sequences[0].sequence.size(); ofstream chimeraFile; ofstream accnosFile; @@ -199,7 +255,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ for(int j=0;jm->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } vector restricted = chimeras; @@ -216,7 +272,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; @@ -225,7 +281,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(comparisons >= 2){ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } int minMismatchToTrimera = numeric_limits::max(); int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; @@ -233,12 +289,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(minMismatchToChimera >= 3 && comparisons >= 3){ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } } double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } string type; string chimeraRefSeq; @@ -252,16 +308,16 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq); double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; @@ -287,27 +343,62 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; } //report progress - if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); } + if((j+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(j+1) + "\n"); } } - if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); } + if((numSeqs) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs) + "\n"); } chimeraFile.close(); accnosFile.close(); //////////////////////////////////////////////////////////////////////////////////////// totalSeqs += numSeqs; + + if (pDataArray->dups) { + if (!pDataArray->m->isBlank(accnosFileName)) { + ifstream in; + pDataArray->m->openInputFile(accnosFileName, in); + string name; + if (pDataArray->hasCount) { + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + outCountList << name << '\t' << pDataArray->groups[u] << endl; + } + in.close(); + }else { + map thisnamemap = parser->getNameMap(pDataArray->groups[u]); + map::iterator itN; + ofstream out; + pDataArray->m->openOutputFile(accnosFileName+".temp", out); + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; pDataArray->m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; } + } + out.close(); + in.close(); + pDataArray->m->renameFile(accnosFileName+".temp", accnosFileName); + } + + } + } //append files pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName); pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); - pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[u] + "."); pDataArray->m->mothurOutEndLine(); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } } + if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); } + pDataArray->count = totalSeqs; - delete parser; + if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs; }