1 #ifndef PAIRWISESEQSCOMMAND_H
2 #define PAIRWISESEQSCOMMAND_H
5 * pairwiseseqscommand.h
8 * Created by westcott on 10/20/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
15 #include "database.hpp"
16 #include "alignment.hpp"
17 #include "validcalculator.h"
19 #include "sequencedb.h"
20 #include "sequence.hpp"
22 #include "gotohoverlap.hpp"
23 #include "needlemanoverlap.hpp"
24 #include "blastalign.hpp"
25 #include "noalign.hpp"
27 #include "ignoregaps.h"
28 #include "eachgapdist.h"
29 #include "eachgapignore.h"
30 #include "onegapdist.h"
31 #include "onegapignore.h"
33 class PairwiseSeqsCommand : public Command {
36 PairwiseSeqsCommand(string);
37 PairwiseSeqsCommand();
38 ~PairwiseSeqsCommand() {}
40 vector<string> setParameters();
41 string getCommandName() { return "pairwise.seqs"; }
42 string getCommandCategory() { return "Sequence Processing"; }
44 string getHelpString();
45 string getOutputPattern(string);
46 string getCitation() { return "Needleman SB, Wunsch CD (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 48: 443-53. [ for needleman ]\nGotoh O (1982). An improved algorithm for matching biological sequences. J Mol Biol 162: 705-8. [ for gotoh ] \nhttp://www.mothur.org/wiki/Pairwise.seqs"; }
47 string getDescription() { return "calculates pairwise distances from an unaligned fasta file"; }
50 void help() { m->mothurOut(getHelpString()); }
58 vector<int> processIDS; //end line, processid
59 vector<distlinePair> lines;
63 void createProcesses(string);
64 int driver(int, int, string, float);
65 int driver(int, int, string, string);
68 int driverMPI(int, int, MPI_File&, float);
69 int driverMPI(int, int, string, unsigned long long&);
70 int driverMPI(int, int, string, unsigned long long&, string);
73 string fastaFileName, align, calc, outputDir, output;
74 float match, misMatch, gapOpen, gapExtend, cutoff;
75 int processors, longestBase;
76 vector<string> fastaFileNames, Estimators;
77 vector<string> outputNames;
79 bool abort, countends, compress;
82 /**************************************************************************************************/
83 //custom data structure for threads to use.
84 // This is passed by void pointer so it can be any data type
85 // that can be passed using a single void pointer (LPVOID).
87 string outputFileName;
88 string align, square, distcalcType, output;
89 unsigned long long start;
90 unsigned long long end;
92 float match, misMatch, gapOpen, gapExtend, cutoff;
93 int count, threadID, longestBase;
98 pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
119 /**************************************************************************************************/
120 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
122 static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){
123 pairwiseData* pDataArray;
124 pDataArray = (pairwiseData*)lpParam;
127 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
128 outFile.setf(ios::fixed, ios::showpoint);
129 outFile << setprecision(4);
131 pDataArray->count = 0;
133 int startTime = time(NULL);
135 Alignment* alignment;
136 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
137 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
138 else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
139 else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
141 pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
142 pDataArray->m->mothurOutEndLine();
143 alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
146 ValidCalculators validCalculator;
147 Dist* distCalculator;
148 if (pDataArray->countends) {
149 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
150 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
151 else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
152 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
155 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
156 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
157 else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
158 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
162 if(pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
164 for(int i=pDataArray->start;i<pDataArray->end;i++){
167 string name = pDataArray->alignDB.get(i).getName();
168 //pad with spaces to make compatible
169 if (name.length() < 10) { while (name.length() < 10) { name += " "; } }
171 outFile << name << '\t';
173 for(int j=0;j<pDataArray->alignDB.getNumSeqs();j++){
175 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
177 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
178 alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
181 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
182 alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
185 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
186 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
188 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
189 seqI.setAligned(alignment->getSeqAAln());
190 seqJ.setAligned(alignment->getSeqBAln());
192 distCalculator->calcDist(seqI, seqJ);
193 double dist = distCalculator->getDist();
195 outFile << dist << '\t';
201 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
205 pDataArray->m->mothurOut(toString(pDataArray->count) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
209 delete distCalculator;
213 catch(exception& e) {
214 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction");
219 /**************************************************************************************************/
220 static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){
221 pairwiseData* pDataArray;
222 pDataArray = (pairwiseData*)lpParam;
225 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
226 outFile.setf(ios::fixed, ios::showpoint);
227 outFile << setprecision(4);
229 pDataArray->count = pDataArray->end;
231 int startTime = time(NULL);
233 Alignment* alignment;
234 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
235 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
236 else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
237 else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
239 pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
240 pDataArray->m->mothurOutEndLine();
241 alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
244 ValidCalculators validCalculator;
245 Dist* distCalculator;
246 if (pDataArray->countends) {
247 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
248 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
249 else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
250 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
253 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
254 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
255 else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
256 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
260 if((pDataArray->output == "lt") && pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
262 for(int i=pDataArray->start;i<pDataArray->end;i++){
264 if(pDataArray->output == "lt") {
265 string name = pDataArray->alignDB.get(i).getName();
266 if (name.length() < 10) { //pad with spaces to make compatible
267 while (name.length() < 10) { name += " "; }
269 outFile << name << '\t';
273 for(int j=0;j<i;j++){
275 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
277 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
278 alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
281 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
282 alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
285 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
286 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
288 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
289 seqI.setAligned(alignment->getSeqAAln());
290 seqJ.setAligned(alignment->getSeqBAln());
292 distCalculator->calcDist(seqI, seqJ);
293 double dist = distCalculator->getDist();
295 if(dist <= pDataArray->cutoff){
296 if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
298 if (pDataArray->output == "lt") { outFile << dist << '\t'; }
301 if (pDataArray->output == "lt") { outFile << endl; }
304 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
308 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
312 delete distCalculator;
316 catch(exception& e) {
317 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction");