X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=aligncommand.cpp;h=96e1a7899c4fec8d769c46aede121bb8de240c07;hb=c82900be3ceed3d9bc491bdc98b1819ef85c1af7;hp=5af8e959a6caa3edd98a33a21fa680bd5815d2dc;hpb=861f46b74c17adec8c6ad6d89f232ae7485797bf;p=mothur.git diff --git a/aligncommand.cpp b/aligncommand.cpp index 5af8e95..96e1a78 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -22,10 +22,6 @@ #include "blastalign.hpp" #include "noalign.hpp" -#include "kmerdb.hpp" -#include "suffixdb.hpp" -#include "blastdb.hpp" - #include "nast.hpp" #include "nastreport.hpp" @@ -43,7 +39,7 @@ AlignCommand::AlignCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"template","candidate","search","ksize","align","match","mismatch","gapopen","gapextend", "processors"}; + string AlignArray[] = {"template","candidate","search","ksize","align","match","mismatch","gapopen","gapextend", "processors","flip","threshold"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -94,6 +90,12 @@ AlignCommand::AlignCommand(string option){ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } convert(temp, processors); + temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "f"; } + flip = isTrue(temp); + + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.80"; } + convert(temp, threshold); + search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } @@ -126,11 +128,15 @@ void AlignCommand::help(){ mothurOut("The template and candidate parameters are required.\n"); mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n"); - mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 7.\n"); + mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n"); + mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold. The default is false.\n"); + mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n"); + mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n"); + mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n"); mothurOut("The align.seqs command should be in the following format: \n"); mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n"); mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n"); @@ -149,29 +155,24 @@ int AlignCommand::execute(){ try { if (abort == true) { return 0; } - if(search == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); } - else if(search == "suffix") { templateDB = new SuffixDB(templateFileName); } - else if(search == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); } - else { - mothurOut(search + " is not a valid search option. I will run the command using kmer, ksize=8."); - mothurOutEndLine(); - kmerSize = 8; - - templateDB = new KmerDB(templateFileName, kmerSize); - } - - if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); } - else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); } + templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); + int longestBase = templateDB->getLongestBase(); + + if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } + else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } else if(align == "noalign") { alignment = new NoAlign(); } else { mothurOut(align + " is not a valid alignment option. I will run the command using needleman."); mothurOutEndLine(); - alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); + alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } + mothurOut("Aligning sequences..."); + mothurOutEndLine(); string alignFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align"; string reportFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report"; + string accnosFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "flip.accnos"; int numFastaSeqs = 0; int start = time(NULL); @@ -184,9 +185,18 @@ int AlignCommand::execute(){ inFASTA.close(); lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], alignFileName, reportFileName); - + + driver(lines[0], alignFileName, reportFileName, accnosFileName); + + //delete accnos file if its blank else report to user + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); } + else { + mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); + if (!flip) { + mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); + }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } + mothurOutEndLine(); + } } else{ vector positions; @@ -197,29 +207,54 @@ int AlignCommand::execute(){ string input; while(!inFASTA.eof()){ - getline(inFASTA, input); + input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); numFastaSeqs = positions.size(); - + int numSeqsPerProcessor = numFastaSeqs / processors; for (int i = 0; i < processors; i++) { - int startPos = positions[ i * numSeqsPerProcessor ]; + long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } - createProcesses(alignFileName, reportFileName); + + createProcesses(alignFileName, reportFileName, accnosFileName); rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); + vector nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;istart); - + for(int i=0;inumSeqs;i++){ - Sequence* candidateSeq = new Sequence(inFASTA); - report.setCandidate(candidateSeq); - - Sequence temp = templateDB->findClosestSequence(candidateSeq); - Sequence* templateSeq = &temp; - - report.setTemplate(templateSeq); - report.setSearchParameters(search, templateDB->getSearchScore()); - - Nast nast(alignment, candidateSeq, templateSeq); - report.setAlignmentParameters(align, alignment); - report.setNastParameters(nast); - - alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; - - report.print(); - - delete candidateSeq; + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + int origNumBases = candidateSeq->getNumBases(); + string originalUnaligned = candidateSeq->getUnaligned(); + int numBasesNeeded = origNumBases * threshold; + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + alignment->resize(candidateSeq->getUnaligned().length()+1); + } + + Sequence temp = templateDB->findClosestSequence(candidateSeq); + Sequence* templateSeq = &temp; + + float searchScore = templateDB->getSearchScore(); + + Nast* nast = new Nast(alignment, candidateSeq, templateSeq); + Sequence* copy; + + Nast* nast2; + bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below + //since nast does not make a copy of hte sequence passed, and it is used by the reporter below + //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place + //so this bool tells you if you need to delete it + + //if there is a possibility that this sequence should be reversed + if (candidateSeq->getNumBases() < numBasesNeeded) { + + string wasBetter = ""; + //if the user wants you to try the reverse + if (flip) { + //get reverse compliment + copy = new Sequence(candidateSeq->getName(), originalUnaligned); + copy->reverseComplement(); + + //rerun alignment + Sequence temp2 = templateDB->findClosestSequence(copy); + Sequence* templateSeq2 = &temp2; + + searchScore = templateDB->getSearchScore(); + + nast2 = new Nast(alignment, copy, templateSeq2); + + //check if any better + if (copy->getNumBases() > candidateSeq->getNumBases()) { + candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better + templateSeq = templateSeq2; + delete nast; + nast = nast2; + needToDeleteCopy = true; + }else{ + wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence."; + delete nast2; + delete copy; + } + } + + //create accnos file with names + accnosFile << candidateSeq->getName() << wasBetter << endl; + } + + report.setCandidate(candidateSeq); + report.setTemplate(templateSeq); + report.setSearchParameters(search, searchScore); + report.setAlignmentParameters(align, alignment); + report.setNastParameters(*nast); + + alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + + report.print(); + delete nast; + if (needToDeleteCopy) { delete copy; } + } + delete candidateSeq; } alignmentFile.close(); inFASTA.close(); + accnosFile.close(); return 1; } @@ -299,7 +408,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ /**************************************************************************************************/ -void AlignCommand::createProcesses(string alignFileName, string reportFileName) { +void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -313,7 +422,7 @@ void AlignCommand::createProcesses(string alignFileName, string reportFileName) processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp"); + driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp"); exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -354,8 +463,7 @@ void AlignCommand::appendAlignFiles(string temp, string filename) { exit(1); } } - -/**************************************************************************************************/ +//********************************************************************************************************************** void AlignCommand::appendReportFiles(string temp, string filename) { try{