X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=aligncommand.cpp;h=bcc59675d3b545f68197a3f56911a769ebe249bb;hb=fefd5ee1517abd3bc38b469cb2dffc85a1571c7e;hp=efc8ce489e79ac5e1d17084f5e952b39590894ba;hpb=90708fe9701e3827e477c82fb3652539c3bf2a0d;p=mothur.git diff --git a/aligncommand.cpp b/aligncommand.cpp index efc8ce4..bcc5967 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -20,21 +20,21 @@ //********************************************************************************************************************** vector AlignCommand::setParameters(){ try { - CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); - CommandParameter pcandidate("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pcandidate); - CommandParameter psearch("search", "Multiple", "kmer-blast-suffix", "kmer", "", "", "",false,false); parameters.push_back(psearch); - CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); - CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "",false,false); parameters.push_back(palign); - CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); - CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); - CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); - CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); - CommandParameter pthreshold("threshold", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(pthreshold); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptemplate); + CommandParameter pcandidate("fasta", "InputTypes", "", "", "none", "none", "none","fasta-alignreport-accnos",false,true,true); parameters.push_back(pcandidate); + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix", "kmer", "", "", "","",false,false,true); parameters.push_back(psearch); + CommandParameter pksize("ksize", "Number", "", "8", "", "", "","",false,false); parameters.push_back(pksize); + CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch); + CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false,true); parameters.push_back(palign); + CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); + CommandParameter pgapopen("gapopen", "Number", "", "-5.0", "", "", "","",false,false); parameters.push_back(pgapopen); + CommandParameter pgapextend("gapextend", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapextend); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pflip); + CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); + CommandParameter pthreshold("threshold", "Number", "", "0.50", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -57,8 +57,8 @@ string AlignCommand::getHelpString(){ helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8."; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0."; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0."; - helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0."; - helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0."; + helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -5.0."; + helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0."; helpString += "The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false."; helpString += "The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment."; helpString += "If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported."; @@ -76,28 +76,22 @@ string AlignCommand::getHelpString(){ } } //********************************************************************************************************************** -string AlignCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string tag = ""; - map >::iterator it; +string AlignCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "fasta") { tag = "align"; } - else if (type == "alignreport") { tag = "align.report"; } - else if (type == "accnos") { tag = "flip.accnos"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return tag; - } - catch(exception& e) { - m->errorOut(e, "AlignCommand", "getOutputFileName"); - exit(1); - } + if (type == "fasta") { pattern = "[filename],align"; } //makes file like: amazon.align + else if (type == "alignreport") { pattern = "[filename],align.report"; } + else if (type == "accnos") { pattern = "[filename],flip.accnos"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "AlignCommand", "getOutputPattern"); + exit(1); + } } - //********************************************************************************************************************** AlignCommand::AlignCommand(){ try { @@ -116,7 +110,7 @@ AlignCommand::AlignCommand(){ //********************************************************************************************************************** AlignCommand::AlignCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help @@ -255,10 +249,10 @@ AlignCommand::AlignCommand(string option) { temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; } m->mothurConvert(temp, misMatch); - temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; } + temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-5.0"; } m->mothurConvert(temp, gapOpen); - temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; } + temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-2.0"; } m->mothurConvert(temp, gapExtend); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } @@ -328,9 +322,11 @@ int AlignCommand::execute(){ m->mothurOut("Aligning sequences from " + candidateFileNames[s] + " ..." ); m->mothurOutEndLine(); if (outputDir == "") { outputDir += m->hasPath(candidateFileNames[s]); } - string alignFileName = outputDir + m->getRootName(m->getSimpleName(candidateFileNames[s])) + getOutputFileNameTag("fasta"); - string reportFileName = outputDir + m->getRootName(m->getSimpleName(candidateFileNames[s])) + getOutputFileNameTag("alignreport"); - string accnosFileName = outputDir + m->getRootName(m->getSimpleName(candidateFileNames[s])) + getOutputFileNameTag("accnos"); + map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(candidateFileNames[s])); + string alignFileName = getOutputFileName("fasta", variables); + string reportFileName = getOutputFileName("alignreport", variables); + string accnosFileName = getOutputFileName("accnos", variables); + bool hasAccnos = true; int numFastaSeqs = 0; @@ -547,6 +543,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam //moved this into driver to avoid deep copies in windows paralellized version Alignment* alignment; int longestBase = templateDB->getLongestBase(); + if (m->debug) { m->mothurOut("[DEBUG]: template longest base = " + toString(templateDB->getLongestBase()) + " \n"); } if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } @@ -569,11 +566,12 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam int numBasesNeeded = origNumBases * threshold; if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file - if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { - alignment->resize(candidateSeq->getUnaligned().length()+1); + if (candidateSeq->getUnaligned().length()+1 > alignment->getnRows()) { + if (m->debug) { m->mothurOut("[DEBUG]: " + candidateSeq->getName() + " " + toString(candidateSeq->getUnaligned().length()) + " " + toString(alignment->getnRows()) + " \n"); } + alignment->resize(candidateSeq->getUnaligned().length()+2); } Sequence temp = templateDB->findClosestSequence(candidateSeq); - Sequence* templateSeq = &temp; + Sequence* templateSeq = new Sequence(temp.getName(), temp.getAligned()); float searchScore = templateDB->getSearchScore(); @@ -597,19 +595,26 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam //get reverse compliment copy = new Sequence(candidateSeq->getName(), originalUnaligned); copy->reverseComplement(); + + if (m->debug) { m->mothurOut("[DEBUG]: flipping " + candidateSeq->getName() + " \n"); } //rerun alignment Sequence temp2 = templateDB->findClosestSequence(copy); - Sequence* templateSeq2 = &temp2; + Sequence* templateSeq2 = new Sequence(temp2.getName(), temp2.getAligned()); + + if (m->debug) { m->mothurOut("[DEBUG]: closest template " + temp2.getName() + " \n"); } searchScore = templateDB->getSearchScore(); nast2 = new Nast(alignment, copy, templateSeq2); + + if (m->debug) { m->mothurOut("[DEBUG]: completed Nast2 " + candidateSeq->getName() + " flipped numBases = " + toString(copy->getNumBases()) + " old numbases = " + toString(candidateSeq->getNumBases()) +" \n"); } //check if any better if (copy->getNumBases() > candidateSeq->getNumBases()) { candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better - templateSeq = templateSeq2; + delete templateSeq; + templateSeq = templateSeq2; delete nast; nast = nast2; needToDeleteCopy = true; @@ -617,8 +622,10 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam }else{ wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence."; delete nast2; + delete templateSeq2; delete copy; } + if (m->debug) { m->mothurOut("[DEBUG]: done.\n"); } } //create accnos file with names @@ -634,6 +641,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam report.print(); delete nast; + delete templateSeq; if (needToDeleteCopy) { delete copy; } count++; @@ -648,11 +656,11 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam #endif //report progress - if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen(toString(count) + "\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen(toString(count) + "\n"); } delete alignment; alignmentFile.close(); @@ -738,7 +746,7 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align } Sequence temp = templateDB->findClosestSequence(candidateSeq); - Sequence* templateSeq = &temp; + Sequence* templateSeq = new Sequence(temp.getName(), temp.getAligned()); float searchScore = templateDB->getSearchScore(); @@ -763,7 +771,7 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align //rerun alignment Sequence temp2 = templateDB->findClosestSequence(copy); - Sequence* templateSeq2 = &temp2; + Sequence* templateSeq2 = new Sequence(temp2.getName(), temp2.getAligned()); searchScore = templateDB->getSearchScore(); @@ -772,7 +780,8 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align //check if any better if (copy->getNumBases() > candidateSeq->getNumBases()) { candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better - templateSeq = templateSeq2; + delete templateSeq; + templateSeq = templateSeq2; delete nast; nast = nast2; needToDeleteCopy = true; @@ -780,6 +789,7 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align }else{ wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence."; delete nast2; + delete templateSeq2; delete copy; } } @@ -825,6 +835,7 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align delete buf3; delete nast; + delete templateSeq; if (needToDeleteCopy) { delete copy; } } delete candidateSeq; @@ -849,34 +860,72 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s try { int num = 0; processIDS.resize(0); + bool recalc = false; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; //loop through and create all the processes you want while (process != processors) { - int pid = fork(); + pid_t pid = fork(); if (pid > 0) { processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp", filename); + num = driver(lines[process], alignFileName + toString(m->mothurGetpid(process)) + ".temp", reportFileName + toString(m->mothurGetpid(process)) + ".temp", accnosFName + m->mothurGetpid(process) + ".temp", filename); //pass numSeqs to parent ofstream out; - string tempFile = alignFileName + toString(getpid()) + ".num.temp"; + string tempFile = alignFileName + toString(m->mothurGetpid(process)) + ".num.temp"; m->openOutputFile(tempFile, out); out << num << endl; out.close(); exit(0); }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); - for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } - exit(0); + m->mothurOut("[ERROR]: unable to spawn the number of processes you requested, reducing number to " + toString(process) + "\n"); processors = process; + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + recalc = true; + break; } } + if (recalc) { + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + vector positions; + positions = m->divideFile(filename, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } + + num = 0; + processIDS.resize(0); + process = 1; + + while (process != processors) { + pid_t pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driver(lines[process], alignFileName + toString(m->mothurGetpid(process)) + ".temp", reportFileName + toString(m->mothurGetpid(process)) + ".temp", accnosFName + m->mothurGetpid(process) + ".temp", filename); + + //pass numSeqs to parent + ofstream out; + string tempFile = alignFileName + toString(m->mothurGetpid(process)) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + } + //do my part num = driver(lines[0], alignFileName, reportFileName, accnosFName, filename); @@ -969,6 +1018,9 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ + if (pDataArray[i]->count != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } num += pDataArray[i]->count; CloseHandle(hThreadArray[i]); delete pDataArray[i];