X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimeraperseuscommand.h;h=b6d4fc9323f659abe52cd70a1e3598a84d9db936;hp=e2855d019ff984d9ac026a2554860c5b3aa33f42;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=5c5c0428f6d548c28a8b903ac80efed4f92d59db diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index e2855d0..b6d4fc9 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -30,8 +30,9 @@ public: vector setParameters(); string getCommandName() { return "chimera.perseus"; } string getCommandCategory() { return "Sequence Processing"; } - string getOutputFileNameTag(string, string); + string getHelpString(); + string getOutputPattern(string); string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection. Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; } string getDescription() { return "detect chimeric sequences"; } @@ -45,7 +46,7 @@ private: linePair(int i, int j) : start(i), end(j) {} }; - bool abort, hasName, hasCount; + bool abort, hasName, hasCount, dups; string fastafile, groupfile, countfile, outputDir, namefile; int processors, alignLength; double cutoff, alpha, beta; @@ -63,8 +64,9 @@ private: vector readFiles(string inputFile, CountTable* ct); vector loadSequences(string); int deconvoluteResults(map&, string, string); - int driverGroups(string, string, int, int, vector); - int createProcessesGroups(string, string, vector, string, string, string); + int driverGroups(string, string, string, int, int, vector); + int createProcessesGroups(string, string, string, vector, string, string, string); + string removeNs(string); }; /**************************************************************************************************/ @@ -77,16 +79,17 @@ struct perseusData { string groupfile; string outputFName; string accnos; + string countlist; MothurOut* m; int start; int end; - bool hasName, hasCount; + bool hasName, hasCount, dups; int threadID, count, numChimeras; double alpha, beta, cutoff; vector groups; perseusData(){} - perseusData(bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + perseusData(bool dps, bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, string ctlist, vector gr, MothurOut* mout, int st, int en, int tid) { alpha = a; beta = b; cutoff = c; @@ -94,6 +97,7 @@ struct perseusData { namefile = n; groupfile = g; outputFName = o; + countlist = ctlist; accnos = ac; m = mout; start = st; @@ -102,6 +106,7 @@ struct perseusData { groups = gr; hasName = hn; hasCount = hc; + dups = dps; count = 0; numChimeras = 0; } @@ -125,7 +130,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ SequenceCountParser* cparser; if (pDataArray->hasCount) { CountTable* ct = new CountTable(); - ct->readTable(pDataArray->namefile); + ct->readTable(pDataArray->namefile, true); cparser = new SequenceCountParser(pDataArray->fastafile, *ct); delete ct; }else { @@ -135,12 +140,15 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int totalSeqs = 0; int numChimeras = 0; + + ofstream outCountList; + if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); } - for (int i = pDataArray->start; i < pDataArray->end; i++) { + for (int u = pDataArray->start; u < pDataArray->end; u++) { int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } - pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[u] + "..."); pDataArray->m->mothurOutEndLine(); //vector sequences = loadSequences(parser, groups[i]); - same function below //////////////////////////////////////////////////////////////////////////////////////// @@ -148,8 +156,8 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int alignLength = 0; vector sequences; if (pDataArray->hasCount) { - vector thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]); - map counts = cparser->getCountTable(pDataArray->groups[i]); + vector thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[u]); + map counts = cparser->getCountTable(pDataArray->groups[u]); map::iterator it; for (int i = 0; i < thisGroupsSeqs.size(); i++) { @@ -159,13 +167,18 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ it = counts.find(thisGroupsSeqs[i].getName()); if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } else { + string newSeq = ""; + string tempSeq = thisGroupsSeqs[i].getUnaligned(); + for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } } + thisGroupsSeqs[i].setAligned(newSeq); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } } } }else{ - vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); - map nameMap = parser->getNameMap(pDataArray->groups[i]); + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[u]); + map nameMap = parser->getNameMap(pDataArray->groups[u]); map::iterator it; for (int i = 0; i < thisGroupsSeqs.size(); i++) { @@ -176,6 +189,11 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } else { int num = pDataArray->m->getNumNames(it->second); + string newSeq = ""; + string tempSeq = thisGroupsSeqs[i].getUnaligned(); + for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } } + thisGroupsSeqs[i].setAligned(newSeq); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } } @@ -194,8 +212,8 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below //////////////////////////////////////////////////////////////////////////////////////// - string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i]; - string accnosFileName = pDataArray->accnos+pDataArray->groups[i]; + string chimeraFileName = pDataArray->outputFName+pDataArray->groups[u]; + string accnosFileName = pDataArray->accnos+pDataArray->groups[u]; vector > correctModel(4); //could be an option in the future to input own model matrix for(int j=0;j<4;j++){ correctModel[j].resize(4); } @@ -325,25 +343,60 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; } //report progress - if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); } + if((j+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(j+1) + "\n"); } } - if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); } + if((numSeqs) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs) + "\n"); } chimeraFile.close(); accnosFile.close(); //////////////////////////////////////////////////////////////////////////////////////// totalSeqs += numSeqs; + + if (pDataArray->dups) { + if (!pDataArray->m->isBlank(accnosFileName)) { + ifstream in; + pDataArray->m->openInputFile(accnosFileName, in); + string name; + if (pDataArray->hasCount) { + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + outCountList << name << '\t' << pDataArray->groups[u] << endl; + } + in.close(); + }else { + map thisnamemap = parser->getNameMap(pDataArray->groups[u]); + map::iterator itN; + ofstream out; + pDataArray->m->openOutputFile(accnosFileName+".temp", out); + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; pDataArray->m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; } + } + out.close(); + in.close(); + pDataArray->m->renameFile(accnosFileName+".temp", accnosFileName); + } + + } + } //append files pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName); pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); - pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[u] + "."); pDataArray->m->mothurOutEndLine(); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } } + if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); } + pDataArray->count = totalSeqs; if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs;