7 * Created by Sarah Westcott on 5/15/09.
8 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 #include "command.hpp"
13 #include "database.hpp"
14 #include "alignment.hpp"
15 #include "alignmentdb.h"
16 #include "sequence.hpp"
18 #include "gotohoverlap.hpp"
19 #include "needlemanoverlap.hpp"
20 #include "blastalign.hpp"
21 #include "noalign.hpp"
24 #include "nastreport.hpp"
27 class AlignCommand : public Command {
34 vector<string> setParameters();
35 string getCommandName() { return "align.seqs"; }
36 string getCommandCategory() { return "Sequence Processing"; }
38 string getHelpString();
39 string getOutputPattern(string);
40 string getCitation() { return "DeSantis TZ, Jr., Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, Andersen GL (2006). NAST: a multiple sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Res 34: W394-9.\nSchloss PD (2009). A high-throughput DNA sequence aligner for microbial ecology studies. PLoS ONE 4: e8230.\nSchloss PD (2010). The effects of alignment quality, distance calculation method, sequence filtering, and region on the analysis of 16S rRNA gene-based studies. PLoS Comput Biol 6: e1000844.\nhttp://www.mothur.org/wiki/Align.seqs http://www.mothur.org/wiki/Align.seqs"; }
41 string getDescription() { return "align sequences"; }
44 void help() { m->mothurOut(getHelpString()); }
48 unsigned long long start;
49 unsigned long long end;
50 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
52 vector<int> processIDS; //processid
53 vector<linePair*> lines;
56 AlignmentDB* templateDB;
58 int driver(linePair*, string, string, string, string);
59 int createProcesses(string, string, string, string);
60 void appendReportFiles(string, string);
63 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
66 string candidateFileName, templateFileName, distanceFileName, search, align, outputDir;
67 float match, misMatch, gapOpen, gapExtend, threshold;
68 int processors, kmerSize;
69 vector<string> candidateFileNames;
70 vector<string> outputNames;
72 bool abort, flip, calledHelp, save;
76 /**************************************************************************************************/
77 //custom data structure for threads to use.
78 // This is passed by void pointer so it can be any data type
79 // that can be passed using a single void pointer (LPVOID).
81 string templateFileName;
89 unsigned long long start;
90 unsigned long long end;
92 //AlignmentDB* templateDB;
93 float match, misMatch, gapOpen, gapExtend, threshold;
94 int count, kmerSize, threadID;
97 alignData(string te, string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
98 templateFileName = te;
121 /**************************************************************************************************/
122 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
124 static DWORD WINAPI MyAlignThreadFunction(LPVOID lpParam){
125 alignData* pDataArray;
126 pDataArray = (alignData*)lpParam;
129 ofstream alignmentFile;
130 pDataArray->m->openOutputFile(pDataArray->alignFName, alignmentFile);
133 pDataArray->m->openOutputFile(pDataArray->accnosFName, accnosFile);
135 NastReport report(pDataArray->reportFName);
138 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
140 //print header if you are process 0
141 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
143 }else { //this accounts for the difference in line endings.
144 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
147 AlignmentDB* templateDB = new AlignmentDB(pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->threadID);
149 //moved this into driver to avoid deep copies in windows paralellized version
150 Alignment* alignment;
151 int longestBase = templateDB->getLongestBase();
152 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); }
153 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); }
154 else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
155 else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
157 pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
158 pDataArray->m->mothurOutEndLine();
159 alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);
162 pDataArray->count = 0;
163 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
165 if (pDataArray->m->control_pressed) { break; }
167 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
168 report.setCandidate(candidateSeq);
170 int origNumBases = candidateSeq->getNumBases();
171 string originalUnaligned = candidateSeq->getUnaligned();
172 int numBasesNeeded = origNumBases * pDataArray->threshold;
174 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
175 if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
176 alignment->resize(candidateSeq->getUnaligned().length()+1);
179 Sequence temp = templateDB->findClosestSequence(candidateSeq);
180 Sequence* templateSeq = &temp;
182 float searchScore = templateDB->getSearchScore();
184 Nast* nast = new Nast(alignment, candidateSeq, templateSeq);
189 bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below
190 //since nast does not make a copy of hte sequence passed, and it is used by the reporter below
191 //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place
192 //so this bool tells you if you need to delete it
194 //if there is a possibility that this sequence should be reversed
195 if (candidateSeq->getNumBases() < numBasesNeeded) {
197 string wasBetter = "";
198 //if the user wants you to try the reverse
199 if (pDataArray->flip) {
201 //get reverse compliment
202 copy = new Sequence(candidateSeq->getName(), originalUnaligned);
203 copy->reverseComplement();
206 Sequence temp2 = templateDB->findClosestSequence(copy);
207 Sequence* templateSeq2 = &temp2;
209 searchScore = templateDB->getSearchScore();
211 nast2 = new Nast(alignment, copy, templateSeq2);
213 //check if any better
214 if (copy->getNumBases() > candidateSeq->getNumBases()) {
215 candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better
216 templateSeq = templateSeq2;
219 needToDeleteCopy = true;
220 wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement.";
222 wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence.";
228 //create accnos file with names
229 accnosFile << candidateSeq->getName() << wasBetter << endl;
232 report.setTemplate(templateSeq);
233 report.setSearchParameters(pDataArray->search, searchScore);
234 report.setAlignmentParameters(pDataArray->align, alignment);
235 report.setNastParameters(*nast);
237 alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
241 if (needToDeleteCopy) { delete copy; }
248 if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen(toString(pDataArray->count)+"\n"); }
252 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen(toString(pDataArray->count)+"\n"); }
256 alignmentFile.close();
260 catch(exception& e) {
261 pDataArray->m->errorOut(e, "AlignCommand", "MyAlignThreadFunction");