]> git.donarmstrong.com Git - mothur.git/blob - pairwiseseqscommand.h
paralellized pairwise.seqs for windows
[mothur.git] / pairwiseseqscommand.h
1 #ifndef PAIRWISESEQSCOMMAND_H
2 #define PAIRWISESEQSCOMMAND_H
3
4 /*
5  *  pairwiseseqscommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 10/20/10.
9  *  Copyright 2010 Schloss Lab. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "database.hpp"
16 #include "alignment.hpp"
17 #include "validcalculator.h"
18 #include "dist.h"
19 #include "sequencedb.h"
20 #include "sequence.hpp"
21
22 #include "gotohoverlap.hpp"
23 #include "needlemanoverlap.hpp"
24 #include "blastalign.hpp"
25 #include "noalign.hpp"
26
27 #include "ignoregaps.h"
28 #include "eachgapdist.h"
29 #include "eachgapignore.h"
30 #include "onegapdist.h"
31 #include "onegapignore.h"
32
33 class PairwiseSeqsCommand : public Command {
34         
35 public:
36         PairwiseSeqsCommand(string);    
37         PairwiseSeqsCommand();
38         ~PairwiseSeqsCommand() {}
39         
40         vector<string> setParameters();
41         string getCommandName()                 { return "pairwise.seqs";               }
42         string getCommandCategory()             { return "Sequence Processing"; }
43         string getHelpString(); 
44         string getCitation() { return "Needleman SB, Wunsch CD (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 48: 443-53. [ for needleman ]\nGotoh O (1982). An improved algorithm for matching biological sequences. J Mol Biol 162: 705-8. [ for gotoh ] \nhttp://www.mothur.org/wiki/Pairwise.seqs"; }
45         string getDescription()         { return "calculates pairwise distances from an unaligned fasta file"; }
46
47         int execute(); 
48         void help() { m->mothurOut(getHelpString()); }  
49         
50 private:
51         struct distlinePair {
52                 int start;
53                 int end;
54         };
55         
56         vector<int> processIDS;   //end line, processid
57         vector<distlinePair> lines;
58         
59         SequenceDB alignDB;
60         
61         void createProcesses(string);
62         int driver(int, int, string, float);
63         int driver(int, int, string, string);
64         
65         #ifdef USE_MPI 
66         int driverMPI(int, int, MPI_File&, float);
67         int driverMPI(int, int, string, unsigned long long&);
68         int driverMPI(int, int, string, unsigned long long&, string);
69         #endif
70         
71         string fastaFileName, align, calc, outputDir, output;
72         float match, misMatch, gapOpen, gapExtend, cutoff;
73         int processors, longestBase;
74         vector<string> fastaFileNames, Estimators;
75         vector<string> outputNames;
76         
77         bool abort, countends, compress;
78 };
79
80 /**************************************************************************************************/
81 //custom data structure for threads to use.
82 // This is passed by void pointer so it can be any data type
83 // that can be passed using a single void pointer (LPVOID).
84 struct pairwiseData {
85     string outputFileName;
86         string align, square, distcalcType, output;
87         unsigned long long start;
88         unsigned long long end;
89         MothurOut* m;
90         float match, misMatch, gapOpen, gapExtend, cutoff;
91         int count, threadID, longestBase;
92     bool countends;
93     SequenceDB alignDB;
94         
95         pairwiseData(){}
96         pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
97                 outputFileName = ofn;
98                 m = mout;
99                 start = st;
100                 end = en;
101                 match = ma; 
102                 misMatch = misMa;
103                 gapOpen = gapO; 
104                 gapExtend = gapE; 
105                 longestBase = thr;
106                 align = al;
107         square = sq;
108         distcalcType = di;
109         countends = co;
110         alignDB = DB;
111                 count = 0;
112         output = op;
113                 threadID = tid;
114         }
115 };
116
117 /**************************************************************************************************/
118 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
119 #else
120 static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){ 
121         pairwiseData* pDataArray;
122         pDataArray = (pairwiseData*)lpParam;
123         
124         try {
125                 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
126                 outFile.setf(ios::fixed, ios::showpoint);
127                 outFile << setprecision(4);
128                 
129                 pDataArray->count = pDataArray->end;
130         
131         int startTime = time(NULL);
132         
133         Alignment* alignment;
134         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                     }
135                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                                }
136                 else if(pDataArray->align == "blast")           {       alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch);            }
137                 else if(pDataArray->align == "noalign")         {       alignment = new NoAlign();                                                                                                      }
138                 else {
139                         pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
140                         pDataArray->m->mothurOutEndLine();
141                         alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
142                 }
143                 
144         ValidCalculators validCalculator;
145         Dist* distCalculator;
146         if (pDataArray->countends) {
147             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
148                 if (pDataArray->distcalcType == "nogaps")                       {       distCalculator = new ignoreGaps();      }
149                 else if (pDataArray->distcalcType == "eachgap") {       distCalculator = new eachGapDist();     }
150                 else if (pDataArray->distcalcType == "onegap")          {       distCalculator = new oneGapDist();      }
151             }
152         }else {
153             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
154                 if (pDataArray->distcalcType == "nogaps")               {       distCalculator = new ignoreGaps();                                      }
155                 else if (pDataArray->distcalcType == "eachgap"){        distCalculator = new eachGapIgnoreTermGapDist();        }
156                 else if (pDataArray->distcalcType == "onegap")  {       distCalculator = new oneGapIgnoreTermGapDist();         }
157             }
158         }
159
160         if(pDataArray->start == 0){     outFile << pDataArray->alignDB.getNumSeqs() << endl;    }
161                 
162                 for(int i=pDataArray->start;i<pDataArray->end;i++){
163             
164                         string name = pDataArray->alignDB.get(i).getName();
165                         //pad with spaces to make compatible
166                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
167             
168                         outFile << name << '\t';        
169                         
170                         for(int j=0;j<pDataArray->alignDB.getNumSeqs();j++){
171                                 
172                                 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0;  }
173                                 
174                                 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
175                                         alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
176                                 }
177                                 
178                                 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
179                                         alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
180                                 }
181                                 
182                                 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
183                                 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
184                                 
185                                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
186                                 seqI.setAligned(alignment->getSeqAAln());
187                                 seqJ.setAligned(alignment->getSeqBAln());
188                                 
189                                 distCalculator->calcDist(seqI, seqJ);
190                                 double dist = distCalculator->getDist();
191                 
192                                 outFile << dist << '\t'; 
193                         }
194                         
195                         outFile << endl; 
196                         
197                         if(i % 100 == 0){
198                                 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
199                         }
200                         
201                 }
202                 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
203                 
204                 outFile.close();
205         delete alignment;
206         delete distCalculator;
207
208         
209     }
210         catch(exception& e) {
211                 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction");
212                 exit(1);
213         }
214
215
216 /**************************************************************************************************/
217 static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){ 
218         pairwiseData* pDataArray;
219         pDataArray = (pairwiseData*)lpParam;
220         
221         try {
222                 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
223                 outFile.setf(ios::fixed, ios::showpoint);
224                 outFile << setprecision(4);
225                 
226         pDataArray->count = pDataArray->end;
227         
228         int startTime = time(NULL);
229         
230         Alignment* alignment;
231         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                     }
232                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                                }
233                 else if(pDataArray->align == "blast")           {       alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch);            }
234                 else if(pDataArray->align == "noalign")         {       alignment = new NoAlign();                                                                                                      }
235                 else {
236                         pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
237                         pDataArray->m->mothurOutEndLine();
238                         alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
239                 }
240                 
241         ValidCalculators validCalculator;
242         Dist* distCalculator;
243         if (pDataArray->countends) {
244             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
245                 if (pDataArray->distcalcType == "nogaps")                       {       distCalculator = new ignoreGaps();      }
246                 else if (pDataArray->distcalcType == "eachgap") {       distCalculator = new eachGapDist();     }
247                 else if (pDataArray->distcalcType == "onegap")          {       distCalculator = new oneGapDist();      }
248             }
249         }else {
250             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
251                 if (pDataArray->distcalcType == "nogaps")               {       distCalculator = new ignoreGaps();                                      }
252                 else if (pDataArray->distcalcType == "eachgap"){        distCalculator = new eachGapIgnoreTermGapDist();        }
253                 else if (pDataArray->distcalcType == "onegap")  {       distCalculator = new oneGapIgnoreTermGapDist();         }
254             }
255         }
256         
257         if((pDataArray->output == "lt") && pDataArray->start == 0){     outFile << pDataArray->alignDB.getNumSeqs() << endl;    }
258                 
259                 for(int i=pDataArray->start;i<pDataArray->end;i++){
260             
261                         if(pDataArray->output == "lt")  {       
262                                 string name = pDataArray->alignDB.get(i).getName();
263                                 if (name.length() < 10) { //pad with spaces to make compatible
264                                         while (name.length() < 10) {  name += " ";  }
265                                 }
266                                 outFile << name << '\t';        
267                         }
268
269                         
270                         for(int j=0;j<i;j++){
271                                 
272                                 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0;  }
273                                 
274                                 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
275                                         alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
276                                 }
277                                 
278                                 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
279                                         alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
280                                 }
281                                 
282                                 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
283                                 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
284                                 
285                                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
286                                 seqI.setAligned(alignment->getSeqAAln());
287                                 seqJ.setAligned(alignment->getSeqBAln());
288                                 
289                                 distCalculator->calcDist(seqI, seqJ);
290                                 double dist = distCalculator->getDist();
291                 
292                                 if(dist <= pDataArray->cutoff){
293                                         if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
294                                 }
295                                 if (pDataArray->output == "lt") {  outFile << dist << '\t'; }
296                         }
297                         
298                         if (pDataArray->output == "lt") { outFile << endl; }
299                         
300                         if(i % 100 == 0){
301                                 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
302                         }
303                         
304                 }
305                 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
306                 
307                 outFile.close();
308         delete alignment;
309         delete distCalculator;
310         
311         
312     }
313         catch(exception& e) {
314                 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction");
315                 exit(1);
316         }
317
318
319 #endif
320
321
322 #endif
323