]> git.donarmstrong.com Git - mothur.git/blob - pairwiseseqscommand.h
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / pairwiseseqscommand.h
1 #ifndef PAIRWISESEQSCOMMAND_H
2 #define PAIRWISESEQSCOMMAND_H
3
4 /*
5  *  pairwiseseqscommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 10/20/10.
9  *  Copyright 2010 Schloss Lab. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "database.hpp"
16 #include "alignment.hpp"
17 #include "validcalculator.h"
18 #include "dist.h"
19 #include "sequencedb.h"
20 #include "sequence.hpp"
21
22 #include "gotohoverlap.hpp"
23 #include "needlemanoverlap.hpp"
24 #include "blastalign.hpp"
25 #include "noalign.hpp"
26
27 #include "ignoregaps.h"
28 #include "eachgapdist.h"
29 #include "eachgapignore.h"
30 #include "onegapdist.h"
31 #include "onegapignore.h"
32
33 class PairwiseSeqsCommand : public Command {
34         
35 public:
36         PairwiseSeqsCommand(string);    
37         PairwiseSeqsCommand();
38         ~PairwiseSeqsCommand() {}
39         
40         vector<string> setParameters();
41         string getCommandName()                 { return "pairwise.seqs";               }
42         string getCommandCategory()             { return "Sequence Processing"; }
43         string getOutputFileNameTag(string, string);
44         string getHelpString(); 
45         string getCitation() { return "Needleman SB, Wunsch CD (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 48: 443-53. [ for needleman ]\nGotoh O (1982). An improved algorithm for matching biological sequences. J Mol Biol 162: 705-8. [ for gotoh ] \nhttp://www.mothur.org/wiki/Pairwise.seqs"; }
46         string getDescription()         { return "calculates pairwise distances from an unaligned fasta file"; }
47
48         int execute(); 
49         void help() { m->mothurOut(getHelpString()); }  
50         
51 private:
52         struct distlinePair {
53                 int start;
54                 int end;
55         };
56         
57         vector<int> processIDS;   //end line, processid
58         vector<distlinePair> lines;
59         
60         SequenceDB alignDB;
61         
62         void createProcesses(string);
63         int driver(int, int, string, float);
64         int driver(int, int, string, string);
65         
66         #ifdef USE_MPI 
67         int driverMPI(int, int, MPI_File&, float);
68         int driverMPI(int, int, string, unsigned long long&);
69         int driverMPI(int, int, string, unsigned long long&, string);
70         #endif
71         
72         string fastaFileName, align, calc, outputDir, output;
73         float match, misMatch, gapOpen, gapExtend, cutoff;
74         int processors, longestBase;
75         vector<string> fastaFileNames, Estimators;
76         vector<string> outputNames;
77         
78         bool abort, countends, compress;
79 };
80
81 /**************************************************************************************************/
82 //custom data structure for threads to use.
83 // This is passed by void pointer so it can be any data type
84 // that can be passed using a single void pointer (LPVOID).
85 struct pairwiseData {
86     string outputFileName;
87         string align, square, distcalcType, output;
88         unsigned long long start;
89         unsigned long long end;
90         MothurOut* m;
91         float match, misMatch, gapOpen, gapExtend, cutoff;
92         int count, threadID, longestBase;
93     bool countends;
94     SequenceDB alignDB;
95         
96         pairwiseData(){}
97         pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
98                 outputFileName = ofn;
99                 m = mout;
100                 start = st;
101                 end = en;
102                 match = ma; 
103                 misMatch = misMa;
104                 gapOpen = gapO; 
105                 gapExtend = gapE; 
106                 longestBase = thr;
107                 align = al;
108         square = sq;
109         distcalcType = di;
110         countends = co;
111         alignDB = DB;
112                 count = 0;
113         output = op;
114                 threadID = tid;
115         }
116 };
117
118 /**************************************************************************************************/
119 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
120 #else
121 static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){ 
122         pairwiseData* pDataArray;
123         pDataArray = (pairwiseData*)lpParam;
124         
125         try {
126                 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
127                 outFile.setf(ios::fixed, ios::showpoint);
128                 outFile << setprecision(4);
129                 
130                 pDataArray->count = pDataArray->end;
131         
132         int startTime = time(NULL);
133         
134         Alignment* alignment;
135         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                     }
136                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                                }
137                 else if(pDataArray->align == "blast")           {       alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch);            }
138                 else if(pDataArray->align == "noalign")         {       alignment = new NoAlign();                                                                                                      }
139                 else {
140                         pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
141                         pDataArray->m->mothurOutEndLine();
142                         alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
143                 }
144                 
145         ValidCalculators validCalculator;
146         Dist* distCalculator;
147         if (pDataArray->countends) {
148             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
149                 if (pDataArray->distcalcType == "nogaps")                       {       distCalculator = new ignoreGaps();      }
150                 else if (pDataArray->distcalcType == "eachgap") {       distCalculator = new eachGapDist();     }
151                 else if (pDataArray->distcalcType == "onegap")          {       distCalculator = new oneGapDist();      }
152             }
153         }else {
154             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
155                 if (pDataArray->distcalcType == "nogaps")               {       distCalculator = new ignoreGaps();                                      }
156                 else if (pDataArray->distcalcType == "eachgap"){        distCalculator = new eachGapIgnoreTermGapDist();        }
157                 else if (pDataArray->distcalcType == "onegap")  {       distCalculator = new oneGapIgnoreTermGapDist();         }
158             }
159         }
160
161         if(pDataArray->start == 0){     outFile << pDataArray->alignDB.getNumSeqs() << endl;    }
162                 
163                 for(int i=pDataArray->start;i<pDataArray->end;i++){
164             
165                         string name = pDataArray->alignDB.get(i).getName();
166                         //pad with spaces to make compatible
167                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
168             
169                         outFile << name << '\t';        
170                         
171                         for(int j=0;j<pDataArray->alignDB.getNumSeqs();j++){
172                                 
173                                 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0;  }
174                                 
175                                 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
176                                         alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
177                                 }
178                                 
179                                 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
180                                         alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
181                                 }
182                                 
183                                 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
184                                 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
185                                 
186                                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
187                                 seqI.setAligned(alignment->getSeqAAln());
188                                 seqJ.setAligned(alignment->getSeqBAln());
189                                 
190                                 distCalculator->calcDist(seqI, seqJ);
191                                 double dist = distCalculator->getDist();
192                 
193                                 outFile << dist << '\t'; 
194                         }
195                         
196                         outFile << endl; 
197                         
198                         if(i % 100 == 0){
199                                 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
200                         }
201                         
202                 }
203                 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
204                 
205                 outFile.close();
206         delete alignment;
207         delete distCalculator;
208
209         
210     }
211         catch(exception& e) {
212                 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction");
213                 exit(1);
214         }
215
216
217 /**************************************************************************************************/
218 static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){ 
219         pairwiseData* pDataArray;
220         pDataArray = (pairwiseData*)lpParam;
221         
222         try {
223                 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
224                 outFile.setf(ios::fixed, ios::showpoint);
225                 outFile << setprecision(4);
226                 
227         pDataArray->count = pDataArray->end;
228         
229         int startTime = time(NULL);
230         
231         Alignment* alignment;
232         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                     }
233                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);                                }
234                 else if(pDataArray->align == "blast")           {       alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch);            }
235                 else if(pDataArray->align == "noalign")         {       alignment = new NoAlign();                                                                                                      }
236                 else {
237                         pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
238                         pDataArray->m->mothurOutEndLine();
239                         alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
240                 }
241                 
242         ValidCalculators validCalculator;
243         Dist* distCalculator;
244         if (pDataArray->countends) {
245             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
246                 if (pDataArray->distcalcType == "nogaps")                       {       distCalculator = new ignoreGaps();      }
247                 else if (pDataArray->distcalcType == "eachgap") {       distCalculator = new eachGapDist();     }
248                 else if (pDataArray->distcalcType == "onegap")          {       distCalculator = new oneGapDist();      }
249             }
250         }else {
251             if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { 
252                 if (pDataArray->distcalcType == "nogaps")               {       distCalculator = new ignoreGaps();                                      }
253                 else if (pDataArray->distcalcType == "eachgap"){        distCalculator = new eachGapIgnoreTermGapDist();        }
254                 else if (pDataArray->distcalcType == "onegap")  {       distCalculator = new oneGapIgnoreTermGapDist();         }
255             }
256         }
257         
258         if((pDataArray->output == "lt") && pDataArray->start == 0){     outFile << pDataArray->alignDB.getNumSeqs() << endl;    }
259                 
260                 for(int i=pDataArray->start;i<pDataArray->end;i++){
261             
262                         if(pDataArray->output == "lt")  {       
263                                 string name = pDataArray->alignDB.get(i).getName();
264                                 if (name.length() < 10) { //pad with spaces to make compatible
265                                         while (name.length() < 10) {  name += " ";  }
266                                 }
267                                 outFile << name << '\t';        
268                         }
269
270                         
271                         for(int j=0;j<i;j++){
272                                 
273                                 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0;  }
274                                 
275                                 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
276                                         alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
277                                 }
278                                 
279                                 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
280                                         alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
281                                 }
282                                 
283                                 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
284                                 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
285                                 
286                                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
287                                 seqI.setAligned(alignment->getSeqAAln());
288                                 seqJ.setAligned(alignment->getSeqBAln());
289                                 
290                                 distCalculator->calcDist(seqI, seqJ);
291                                 double dist = distCalculator->getDist();
292                 
293                                 if(dist <= pDataArray->cutoff){
294                                         if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
295                                 }
296                                 if (pDataArray->output == "lt") {  outFile << dist << '\t'; }
297                         }
298                         
299                         if (pDataArray->output == "lt") { outFile << endl; }
300                         
301                         if(i % 100 == 0){
302                                 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
303                         }
304                         
305                 }
306                 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
307                 
308                 outFile.close();
309         delete alignment;
310         delete distCalculator;
311         
312         
313     }
314         catch(exception& e) {
315                 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction");
316                 exit(1);
317         }
318
319
320 #endif
321
322
323 #endif
324