X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=preclustercommand.cpp;h=66fd8500e3c7d583604106e0408e15bc36dc4a92;hb=5b9b3e01150055e3b4bb1a911ea4c61d0b755e89;hp=479a862935430545757ef48c6da660913782cfa2;hpb=832d53a9dfac6b1795735eec643d8cf627b0d8e3;p=mothur.git diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 479a862..66fd850 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -43,9 +43,10 @@ PreClusterCommand::PreClusterCommand(string option){ //check for optional parameter and set defaults // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { abort = true; } - else { readNameFile(); } +// else { readNameFile(); } string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } convert(temp, diffs); @@ -87,68 +88,62 @@ int PreClusterCommand::execute(){ if (abort == true) { return 0; } //reads fasta file and return number of seqs - int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active + int numSeqs = readNamesFASTA(); //fills alignSeqs and makes all seqs active if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0; } if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0; } //clear sizes since you only needed this info to build the alignSeqs seqPNode structs - sizes.clear(); +// sizes.clear(); //sort seqs by number of identical seqs sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); //go through active list and cluster everthing you can, until all nodes are inactive //taking advantage of the fact that maps are already sorted - map::iterator itActive; - map::iterator it2Active; +// map::iterator itActive; +// map::iterator it2Active; int count = 0; - for (int i = 0; i < alignSeqs.size(); i++) { - - //are you active - itActive = active.find(alignSeqs[i].seq.getName()); + for (int i = 0; i < numSeqs; i++) { - if (itActive != active.end()) { //this sequence has not been merged yet + //are you active + // itActive = active.find(alignSeqs[i].seq.getName()); - //try to merge it with all smaller seqs - for (int j = i; j < alignSeqs.size(); j++) { - - if (i != j) { - //are you active - it2Active = active.find(alignSeqs[j].seq.getName()); - if (it2Active != active.end()) { //this sequence has not been merged yet - //are you within "diff" bases - int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); - - if (mismatch <= diffs) { - //merge - names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()]; - - //remove from active list - active.erase(it2Active); - - //set numIdentical to 0, so you only print the representative seqs in the fasta file - alignSeqs[j].numIdentical = 0; - count++; - } - }//end if j active - }//end if i != j - }//end for loop + if (alignSeqs[i].active) { //this sequence has not been merged yet - //remove from active list - active.erase(itActive); + //try to merge it with all smaller seqs + for (int j = i+1; j < numSeqs; j++) { + if (alignSeqs[j].active) { //this sequence has not been merged yet + //are you within "diff" bases + int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); + + if (mismatch <= diffs) { + //merge + alignSeqs[i].names += ',' + alignSeqs[j].names; + alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; + + alignSeqs[j].active = 0; + alignSeqs[j].numIdentical = 0; + count++; + } + }//end if j active + }//end if i != j + + //remove from active list + alignSeqs[i].active = 0; }//end if active i + if(i % 100 == 0) { cout << i << '\t' << numSeqs - count << '\t' << count << endl; } } string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile); string newNamesFile = getRootName(fastafile) + "precluster.names"; - printData(newFastaFile, newNamesFile); mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine(); mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); - + printData(newFastaFile, newNamesFile); + return 0; } @@ -158,65 +153,85 @@ int PreClusterCommand::execute(){ } } /**************************************************************************************************/ -int PreClusterCommand::readSeqs(){ +int PreClusterCommand::readFASTA(){ try { - ifstream inFasta; - openInputFile(fastafile, inFasta); - length = 0; - map::iterator it; - - while (!inFasta.eof()) { - Sequence temp(inFasta); //read seq - - if (temp.getName() != "") { - if (namefile != "") { - //make sure fasta and name files match - it = names.find(temp.getName()); - if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } - }else { sizes[temp.getName()] = 1; } - - seqPNode tempNode(sizes[temp.getName()], temp); - alignSeqs.push_back(tempNode); - active[temp.getName()] = true; - } - gobble(inFasta); - } - inFasta.close(); - - if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); } - +// ifstream inFasta; +// openInputFile(fastafile, inFasta); +// length = 0; +//// map::iterator it; +// +// while (!inFasta.eof()) { +// Sequence temp(inFasta); //read seq +// +// if (temp.getName() != "") { +// if (namefile != "") { +// //make sure fasta and name files match +// it = names.find(temp.getName()); +// if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } +// }else { sizes[temp.getName()] = 1; } +// +// seqPNode tempNode(sizes[temp.getName()], temp); +// alignSeqs.push_back(tempNode); +// active[temp.getName()] = true; +// } +// gobble(inFasta); +// } +// inFasta.close(); +// +// if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); } +// return alignSeqs.size(); } catch(exception& e) { - errorOut(e, "PreClusterCommand", "readSeqs"); + errorOut(e, "PreClusterCommand", "readFASTA"); exit(1); } } /**************************************************************************************************/ -void PreClusterCommand::readNameFile(){ + +int PreClusterCommand::readNamesFASTA(){ try { - ifstream in; - openInputFile(namefile, in); - string firstCol, secondCol; - - while (!in.eof()) { - in >> firstCol >> secondCol; gobble(in); - names[firstCol] = secondCol; + ifstream inNames; + ifstream inFasta; + + openInputFile(namefile, inNames); + openInputFile(fastafile, inFasta); + + string firstCol, secondCol, nameString; + length = 0; + + while (inFasta && inNames) { + + inNames >> firstCol >> secondCol; + nameString = secondCol; + + gobble(inNames); int size = 1; while (secondCol.find_first_of(',') != -1) { size++; secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); } - sizes[firstCol] = size; + Sequence seq(inFasta); + if (seq.getName() != firstCol) { mothurOut(seq.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } + else{ + seqPNode tempNode(size, seq, nameString); + alignSeqs.push_back(tempNode); + if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); } + } } - in.close(); + inFasta.close(); + inNames.close(); + return alignSeqs.size(); } + catch(exception& e) { - errorOut(e, "PreClusterCommand", "readNameFile"); + errorOut(e, "PreClusterCommand", "readNamesFASTA"); exit(1); } } + /**************************************************************************************************/ + int PreClusterCommand::calcMisMatches(string seq1, string seq2){ try { int numBad = 0; @@ -234,23 +249,22 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ exit(1); } } + /**************************************************************************************************/ + void PreClusterCommand::printData(string newfasta, string newname){ try { ofstream outFasta; ofstream outNames; + openOutputFile(newfasta, outFasta); openOutputFile(newname, outNames); - map::iterator itNames; for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - - itNames = names.find(alignSeqs[i].seq.getName()); - - outNames << itNames->first << '\t' << itNames->second << endl; + outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } }