X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=preclustercommand.cpp;h=a7f211345697a72c18b6fa0aaa1ac40200c46d94;hb=bd93b1a6f9fe9a6a4a7ac2e9f106e5c83a438856;hp=0275648066b99c55533e4aaacdcde5bb21edad5f;hpb=51d85e945d26324e50d6406829ce006ccc9eb1c0;p=mothur.git diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 0275648..a7f2113 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -44,7 +44,7 @@ PreClusterCommand::PreClusterCommand(string option) { it = parameters.find("fasta"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["fasta"] = inputDir + it->second; } } @@ -52,7 +52,7 @@ PreClusterCommand::PreClusterCommand(string option) { it = parameters.find("name"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } @@ -66,7 +66,7 @@ PreClusterCommand::PreClusterCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; - outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it } //check for optional parameter and set defaults @@ -115,6 +115,8 @@ int PreClusterCommand::execute(){ if (abort == true) { return 0; } + int start = time(NULL); + //reads fasta file and return number of seqs int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active @@ -123,66 +125,65 @@ int PreClusterCommand::execute(){ if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; } if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; } - string fileroot = outputDir + getRootName(getSimpleName(fastafile)); - string newFastaFile = fileroot + "precluster" + getExtension(fastafile); - string newNamesFile = fileroot + "precluster.names"; - ofstream outFasta; - ofstream outNames; - - openOutputFile(newFastaFile, outFasta); - openOutputFile(newNamesFile, outNames); - + //clear sizes since you only needed this info to build the alignSeqs seqPNode structs +// sizes.clear(); + //sort seqs by number of identical seqs - alignSeqs.sort(comparePriority); + sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); int count = 0; - int i = 0; //think about running through twice... - list::iterator itList; - list::iterator itList2; - for (itList = alignSeqs.begin(); itList != alignSeqs.end(); itList++) { - //try to merge it with all smaller seqs - for (itList2 = alignSeqs.begin(); itList2 != alignSeqs.end(); itList2++) { + for (int i = 0; i < numSeqs; i++) { + + //are you active + // itActive = active.find(alignSeqs[i].seq.getName()); + + if (alignSeqs[i].active) { //this sequence has not been merged yet + + //try to merge it with all smaller seqs + for (int j = i+1; j < numSeqs; j++) { - if (m->control_pressed) { outFasta.close(); outNames.close(); remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } + if (m->control_pressed) { return 0; } - if ((*itList).seq.getName() != (*itList2).seq.getName()) { //you don't want to merge with yourself - //are you within "diff" bases - int mismatch = calcMisMatches((*itList).seq.getAligned(), (*itList2).seq.getAligned()); + if (alignSeqs[j].active) { //this sequence has not been merged yet + //are you within "diff" bases + int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); - if (mismatch <= diffs) { - //merge - (*itList).names += ',' + (*itList2).names; - (*itList).numIdentical += (*itList2).numIdentical; + if (mismatch <= diffs) { + //merge + alignSeqs[i].names += ',' + alignSeqs[j].names; + alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; - alignSeqs.erase(itList2); itList2--; - count++; - } - } - } - - //ouptut this sequence - printData(outFasta, outNames, (*itList)); - - //remove sequence - alignSeqs.erase(itList); itList--; - - i++; + alignSeqs[j].active = 0; + alignSeqs[j].numIdentical = 0; + count++; + } + }//end if j active + }//end if i != j + //remove from active list + alignSeqs[i].active = 0; + + }//end if active i if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } } - - if(i % 100 != 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } - outFasta.close(); - outNames.close(); + if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } + - if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } - + string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); + + string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile); + string newNamesFile = fileroot + "precluster.names"; + + if (m->control_pressed) { return 0; } + + m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); + m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + printData(newFastaFile, newNamesFile); - m->mothurOut("Total number of sequences before precluster was " + toString(numSeqs) + "."); m->mothurOutEndLine(); - m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } @@ -210,8 +211,8 @@ int PreClusterCommand::readFASTA(){ //ifstream inNames; ifstream inFasta; - //openInputFile(namefile, inNames); - openInputFile(fastafile, inFasta); + //m->openInputFile(namefile, inNames); + m->openInputFile(fastafile, inFasta); //string firstCol, secondCol, nameString; length = 0; @@ -223,14 +224,14 @@ int PreClusterCommand::readFASTA(){ //inNames >> firstCol >> secondCol; //nameString = secondCol; - //gobble(inNames); + //m->gobble(inNames); //int size = 1; //while (secondCol.find_first_of(',') != -1) { // size++; // secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); //} - Sequence seq(inFasta); gobble(inFasta); + Sequence seq(inFasta); m->gobble(inFasta); if (seq.getName() != "") { //can get "" if commented line is at end of fasta file if (namefile != "") { @@ -282,10 +283,25 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ /**************************************************************************************************/ -void PreClusterCommand::printData(ofstream& outFasta, ofstream& outNames, seqPNode thisSeq){ +void PreClusterCommand::printData(string newfasta, string newname){ try { - thisSeq.seq.printSequence(outFasta); - outNames << thisSeq.seq.getName() << '\t' << thisSeq.names << endl; + ofstream outFasta; + ofstream outNames; + + m->openOutputFile(newfasta, outFasta); + m->openOutputFile(newname, outNames); + + + for (int i = 0; i < alignSeqs.size(); i++) { + if (alignSeqs[i].numIdentical != 0) { + alignSeqs[i].seq.printSequence(outFasta); + outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + } + } + + outFasta.close(); + outNames.close(); + } catch(exception& e) { m->errorOut(e, "PreClusterCommand", "printData"); @@ -297,16 +313,16 @@ void PreClusterCommand::printData(ofstream& outFasta, ofstream& outNames, seqPNo void PreClusterCommand::readNameFile(){ try { ifstream in; - openInputFile(namefile, in); + m->openInputFile(namefile, in); string firstCol, secondCol; while (!in.eof()) { - in >> firstCol >> secondCol; gobble(in); + in >> firstCol >> secondCol; m->gobble(in); names[firstCol] = secondCol; int size = 1; - while (secondCol.find_first_of(',') != -1) { - size++; - secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); + + for(int i=0;i