X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=aligncommand.h;h=4e1b8d39f81dc067c60575d1938a21fd1de8dee0;hp=b58c3f29bcc653e48c2d30fc9ae4a57344c4330e;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=284fd95c611ccc3b1a7875c4dacfca06d1f50ed6 diff --git a/aligncommand.h b/aligncommand.h index b58c3f2..4e1b8d3 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -1,6 +1,5 @@ #ifndef ALIGNCOMMAND_H #define ALIGNCOMMAND_H - /* * aligncommand.h * Mothur @@ -10,48 +9,261 @@ * */ -#include "mothur.h" #include "command.hpp" #include "database.hpp" #include "alignment.hpp" #include "alignmentdb.h" +#include "sequence.hpp" + +#include "gotohoverlap.hpp" +#include "needlemanoverlap.hpp" +#include "blastalign.hpp" +#include "noalign.hpp" +#include "nast.hpp" +#include "nastreport.hpp" + +//test class AlignCommand : public Command { public: AlignCommand(string); + AlignCommand(); ~AlignCommand(); + + vector setParameters(); + string getCommandName() { return "align.seqs"; } + string getCommandCategory() { return "Sequence Processing"; } + + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "DeSantis TZ, Jr., Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, Andersen GL (2006). NAST: a multiple sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Res 34: W394-9.\nSchloss PD (2009). A high-throughput DNA sequence aligner for microbial ecology studies. PLoS ONE 4: e8230.\nSchloss PD (2010). The effects of alignment quality, distance calculation method, sequence filtering, and region on the analysis of 16S rRNA gene-based studies. PLoS Comput Biol 6: e1000844.\nhttp://www.mothur.org/wiki/Align.seqs http://www.mothur.org/wiki/Align.seqs"; } + string getDescription() { return "align sequences"; } + int execute(); - void help(); + void help() { m->mothurOut(getHelpString()); } private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid vector lines; bool MPIWroteAccnos; AlignmentDB* templateDB; - Alignment* alignment; int driver(linePair*, string, string, string, string); int createProcesses(string, string, string, string); - void appendAlignFiles(string, string); void appendReportFiles(string, string); #ifdef USE_MPI - int driverMPI(MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif string candidateFileName, templateFileName, distanceFileName, search, align, outputDir; float match, misMatch, gapOpen, gapExtend, threshold; int processors, kmerSize; vector candidateFileNames; + vector outputNames; + + bool abort, flip, calledHelp, save; + +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct alignData { + string templateFileName; + string alignFName; + string reportFName; + string accnosFName; + string filename; + string align; + string search; + bool flip; + unsigned long long start; + unsigned long long end; + MothurOut* m; + //AlignmentDB* templateDB; + float match, misMatch, gapOpen, gapExtend, threshold; + int count, kmerSize, threadID; - bool abort, flip; + alignData(){} + alignData(string te, string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) { + templateFileName = te; + alignFName = a; + reportFName = r; + accnosFName = ac; + filename = f; + flip = fl; + m = mout; + start = st; + end = en; + //templateDB = tdb; + match = ma; + misMatch = misMa; + gapOpen = gapO; + gapExtend = gapE; + threshold = thr; + align = al; + search = se; + count = 0; + kmerSize = ks; + threadID = tid; + } }; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyAlignThreadFunction(LPVOID lpParam){ + alignData* pDataArray; + pDataArray = (alignData*)lpParam; + + try { + ofstream alignmentFile; + pDataArray->m->openOutputFile(pDataArray->alignFName, alignmentFile); + + ofstream accnosFile; + pDataArray->m->openOutputFile(pDataArray->accnosFName, accnosFile); + + NastReport report(pDataArray->reportFName); + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); + } + + AlignmentDB* templateDB = new AlignmentDB(pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->threadID); + + //moved this into driver to avoid deep copies in windows paralellized version + Alignment* alignment; + int longestBase = templateDB->getLongestBase(); + if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); } + else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); } + else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); } + else if(pDataArray->align == "noalign") { alignment = new NoAlign(); } + else { + pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman."); + pDataArray->m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); + } + + pDataArray->count = 0; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { break; } + + Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA); + report.setCandidate(candidateSeq); + + int origNumBases = candidateSeq->getNumBases(); + string originalUnaligned = candidateSeq->getUnaligned(); + int numBasesNeeded = origNumBases * pDataArray->threshold; + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + alignment->resize(candidateSeq->getUnaligned().length()+1); + } + + Sequence temp = templateDB->findClosestSequence(candidateSeq); + Sequence* templateSeq = &temp; + + float searchScore = templateDB->getSearchScore(); + + Nast* nast = new Nast(alignment, candidateSeq, templateSeq); + + Sequence* copy; + + Nast* nast2; + bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below + //since nast does not make a copy of hte sequence passed, and it is used by the reporter below + //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place + //so this bool tells you if you need to delete it + + //if there is a possibility that this sequence should be reversed + if (candidateSeq->getNumBases() < numBasesNeeded) { + + string wasBetter = ""; + //if the user wants you to try the reverse + if (pDataArray->flip) { + + //get reverse compliment + copy = new Sequence(candidateSeq->getName(), originalUnaligned); + copy->reverseComplement(); + + //rerun alignment + Sequence temp2 = templateDB->findClosestSequence(copy); + Sequence* templateSeq2 = &temp2; + + searchScore = templateDB->getSearchScore(); + + nast2 = new Nast(alignment, copy, templateSeq2); + + //check if any better + if (copy->getNumBases() > candidateSeq->getNumBases()) { + candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better + templateSeq = templateSeq2; + delete nast; + nast = nast2; + needToDeleteCopy = true; + wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement."; + }else{ + wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence."; + delete nast2; + delete copy; + } + } + + //create accnos file with names + accnosFile << candidateSeq->getName() << wasBetter << endl; + } + + report.setTemplate(templateSeq); + report.setSearchParameters(pDataArray->search, searchScore); + report.setAlignmentParameters(pDataArray->align, alignment); + report.setNastParameters(*nast); + + alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + + report.print(); + delete nast; + if (needToDeleteCopy) { delete copy; } + + pDataArray->count++; + } + delete candidateSeq; + + //report progress + if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen(toString(pDataArray->count)+"\n"); } + + } + //report progress + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen(toString(pDataArray->count)+"\n"); } + + delete alignment; + delete templateDB; + alignmentFile.close(); + inFASTA.close(); + accnosFile.close(); + } + catch(exception& e) { + pDataArray->m->errorOut(e, "AlignCommand", "MyAlignThreadFunction"); + exit(1); + } +} +#endif + + + #endif