]> git.donarmstrong.com Git - mothur.git/blob - aligncommand.h
added debug flag to mothurOut. added debug parameter to set.dir. added debug code...
[mothur.git] / aligncommand.h
1 #ifndef ALIGNCOMMAND_H
2 #define ALIGNCOMMAND_H
3 //test
4 /*
5  *  aligncommand.h
6  *  Mothur
7  *
8  *  Created by Sarah Westcott on 5/15/09.
9  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10  *
11  */
12
13 #include "command.hpp"
14 #include "database.hpp"
15 #include "alignment.hpp"
16 #include "alignmentdb.h"
17 #include "sequence.hpp"
18
19 #include "gotohoverlap.hpp"
20 #include "needlemanoverlap.hpp"
21 #include "blastalign.hpp"
22 #include "noalign.hpp"
23
24 #include "nast.hpp"
25 #include "nastreport.hpp"
26
27 //test
28 class AlignCommand : public Command {
29         
30 public:
31         AlignCommand(string);   
32         AlignCommand();
33         ~AlignCommand();
34         
35         vector<string> setParameters();
36         string getCommandName()                 { return "align.seqs";                  }
37         string getCommandCategory()             { return "Sequence Processing"; }
38         string getHelpString(); 
39         string getCitation() { return "DeSantis TZ, Jr., Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, Andersen GL (2006). NAST: a multiple sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Res 34: W394-9.\nSchloss PD (2009). A high-throughput DNA sequence aligner for microbial ecology studies. PLoS ONE 4: e8230.\nSchloss PD (2010). The effects of alignment quality, distance calculation method, sequence filtering, and region on the analysis of 16S rRNA gene-based studies. PLoS Comput Biol 6: e1000844.\nhttp://www.mothur.org/wiki/Align.seqs http://www.mothur.org/wiki/Align.seqs"; }
40         string getDescription()         { return "align sequences"; }
41         
42         int execute(); 
43         void help() { m->mothurOut(getHelpString()); }  
44         
45 private:
46         struct linePair {
47                 unsigned long long start;
48                 unsigned long long end;
49                 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
50         };
51         vector<int> processIDS;   //processid
52         vector<linePair*> lines;
53         bool MPIWroteAccnos;
54         
55         AlignmentDB* templateDB;
56         
57         int driver(linePair*, string, string, string, string);
58         int createProcesses(string, string, string, string);
59         void appendAlignFiles(string, string); 
60         void appendReportFiles(string, string);
61         
62         #ifdef USE_MPI
63         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
64         #endif
65         
66         string candidateFileName, templateFileName, distanceFileName, search, align, outputDir;
67         float match, misMatch, gapOpen, gapExtend, threshold;
68         int processors, kmerSize;
69         vector<string> candidateFileNames;
70         vector<string> outputNames;
71         
72         bool abort, flip, calledHelp, save;
73
74 };
75
76 /**************************************************************************************************/
77 //custom data structure for threads to use.
78 // This is passed by void pointer so it can be any data type
79 // that can be passed using a single void pointer (LPVOID).
80 struct alignData {
81         string templateFileName;
82         string alignFName; 
83         string reportFName; 
84         string accnosFName;
85         string filename;
86         string align;
87         string search;
88         bool flip;
89         unsigned long long start;
90         unsigned long long end;
91         MothurOut* m;
92         //AlignmentDB* templateDB;
93         float match, misMatch, gapOpen, gapExtend, threshold;
94         int count, kmerSize, threadID;
95         
96         alignData(){}
97         alignData(string te, string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
98                 templateFileName = te;
99                 alignFName = a;
100                 reportFName = r;
101                 accnosFName = ac;
102                 filename = f;
103                 flip = fl;
104                 m = mout;
105                 start = st;
106                 end = en;
107                 //templateDB = tdb;
108                 match = ma; 
109                 misMatch = misMa;
110                 gapOpen = gapO; 
111                 gapExtend = gapE; 
112                 threshold = thr;
113                 align = al;
114                 search = se;
115                 count = 0;
116                 kmerSize = ks;
117                 threadID = tid;
118         }
119 };
120
121 /**************************************************************************************************/
122 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
123 #else
124 static DWORD WINAPI MyAlignThreadFunction(LPVOID lpParam){ 
125         alignData* pDataArray;
126         pDataArray = (alignData*)lpParam;
127         
128         try {
129                 ofstream alignmentFile;
130                 pDataArray->m->openOutputFile(pDataArray->alignFName, alignmentFile);
131                 
132                 ofstream accnosFile;
133                 pDataArray->m->openOutputFile(pDataArray->accnosFName, accnosFile);
134                 
135                 NastReport report(pDataArray->reportFName);
136                 
137                 ifstream inFASTA;
138                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
139                 
140                 //print header if you are process 0
141                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
142                         inFASTA.seekg(0);
143                 }else { //this accounts for the difference in line endings. 
144                         inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); 
145                 }
146                 
147                 pDataArray->count = pDataArray->end;
148                 
149                 AlignmentDB* templateDB = new AlignmentDB(pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->threadID);
150                 
151                 //moved this into driver to avoid deep copies in windows paralellized version
152                 Alignment* alignment;
153                 int longestBase = templateDB->getLongestBase();
154                 if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
155                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
156                 else if(pDataArray->align == "blast")           {       alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch);            }
157                 else if(pDataArray->align == "noalign")         {       alignment = new NoAlign();                                                                                                      }
158                 else {
159                         pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
160                         pDataArray->m->mothurOutEndLine();
161                         alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);
162                 }
163                 
164                 int count = 0;
165                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
166                         
167                         if (pDataArray->m->control_pressed) {  break; }
168                         
169                         Sequence* candidateSeq = new Sequence(inFASTA);  pDataArray->m->gobble(inFASTA);
170                         report.setCandidate(candidateSeq);
171                         
172                         int origNumBases = candidateSeq->getNumBases();
173                         string originalUnaligned = candidateSeq->getUnaligned();
174                         int numBasesNeeded = origNumBases * pDataArray->threshold;
175                         
176                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
177                                 if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
178                                         alignment->resize(candidateSeq->getUnaligned().length()+1);
179                                 }
180                                 
181                                 Sequence temp = templateDB->findClosestSequence(candidateSeq);
182                                 Sequence* templateSeq = &temp;
183                                 
184                                 float searchScore = templateDB->getSearchScore();
185                                 
186                                 Nast* nast = new Nast(alignment, candidateSeq, templateSeq);
187                                 
188                                 Sequence* copy;
189                                 
190                                 Nast* nast2;
191                                 bool needToDeleteCopy = false;  //this is needed in case you have you enter the ifs below
192                                 //since nast does not make a copy of hte sequence passed, and it is used by the reporter below
193                                 //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place
194                                 //so this bool tells you if you need to delete it
195                                 
196                                 //if there is a possibility that this sequence should be reversed
197                                 if (candidateSeq->getNumBases() < numBasesNeeded) {
198                                         
199                                         string wasBetter =  "";
200                                         //if the user wants you to try the reverse
201                                         if (pDataArray->flip) {
202                                                 
203                                                 //get reverse compliment
204                                                 copy = new Sequence(candidateSeq->getName(), originalUnaligned);
205                                                 copy->reverseComplement();
206                                                 
207                                                 //rerun alignment
208                                                 Sequence temp2 = templateDB->findClosestSequence(copy);
209                                                 Sequence* templateSeq2 = &temp2;
210                                                 
211                                                 searchScore = templateDB->getSearchScore();
212                                                 
213                                                 nast2 = new Nast(alignment, copy, templateSeq2);
214                                                 
215                                                 //check if any better
216                                                 if (copy->getNumBases() > candidateSeq->getNumBases()) {
217                                                         candidateSeq->setAligned(copy->getAligned());  //use reverse compliments alignment since its better
218                                                         templateSeq = templateSeq2; 
219                                                         delete nast;
220                                                         nast = nast2;
221                                                         needToDeleteCopy = true;
222                                                         wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement.";
223                                                 }else{  
224                                                         wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence.";
225                                                         delete nast2;
226                                                         delete copy;    
227                                                 }
228                                         }
229                                         
230                                         //create accnos file with names
231                                         accnosFile << candidateSeq->getName() << wasBetter << endl;
232                                 }
233                                 
234                                 report.setTemplate(templateSeq);
235                                 report.setSearchParameters(pDataArray->search, searchScore);
236                                 report.setAlignmentParameters(pDataArray->align, alignment);
237                                 report.setNastParameters(*nast);
238                                 
239                                 alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
240                                 
241                                 report.print();
242                                 delete nast;
243                                 if (needToDeleteCopy) {   delete copy;   }
244                                 
245                                 count++;
246                         }
247                         delete candidateSeq;
248                         
249                         //report progress
250                         if((count) % 100 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
251                         
252                 }
253                 //report progress
254                 if((count) % 100 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
255                 
256                 delete alignment;
257                 delete templateDB;
258                 alignmentFile.close();
259                 inFASTA.close();
260                 accnosFile.close();
261         }
262         catch(exception& e) {
263                 pDataArray->m->errorOut(e, "AlignCommand", "MyAlignThreadFunction");
264                 exit(1);
265         }
266
267 #endif
268
269
270
271 #endif