1 #ifndef PAIRWISESEQSCOMMAND_H
2 #define PAIRWISESEQSCOMMAND_H
5 * pairwiseseqscommand.h
8 * Created by westcott on 10/20/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
15 #include "database.hpp"
16 #include "alignment.hpp"
17 #include "validcalculator.h"
19 #include "sequencedb.h"
20 #include "sequence.hpp"
22 #include "gotohoverlap.hpp"
23 #include "needlemanoverlap.hpp"
24 #include "blastalign.hpp"
25 #include "noalign.hpp"
27 #include "ignoregaps.h"
28 #include "eachgapdist.h"
29 #include "eachgapignore.h"
30 #include "onegapdist.h"
31 #include "onegapignore.h"
33 class PairwiseSeqsCommand : public Command {
36 PairwiseSeqsCommand(string);
37 PairwiseSeqsCommand();
38 ~PairwiseSeqsCommand() {}
40 vector<string> setParameters();
41 string getCommandName() { return "pairwise.seqs"; }
42 string getCommandCategory() { return "Sequence Processing"; }
43 string getHelpString();
44 string getCitation() { return "Needleman SB, Wunsch CD (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 48: 443-53. [ for needleman ]\nGotoh O (1982). An improved algorithm for matching biological sequences. J Mol Biol 162: 705-8. [ for gotoh ] \nhttp://www.mothur.org/wiki/Pairwise.seqs"; }
45 string getDescription() { return "calculates pairwise distances from an unaligned fasta file"; }
48 void help() { m->mothurOut(getHelpString()); }
56 vector<int> processIDS; //end line, processid
57 vector<distlinePair> lines;
61 void createProcesses(string);
62 int driver(int, int, string, float);
63 int driver(int, int, string, string);
66 int driverMPI(int, int, MPI_File&, float);
67 int driverMPI(int, int, string, unsigned long long&);
68 int driverMPI(int, int, string, unsigned long long&, string);
71 string fastaFileName, align, calc, outputDir, output;
72 float match, misMatch, gapOpen, gapExtend, cutoff;
73 int processors, longestBase;
74 vector<string> fastaFileNames, Estimators;
75 vector<string> outputNames;
77 bool abort, countends, compress;
80 /**************************************************************************************************/
81 //custom data structure for threads to use.
82 // This is passed by void pointer so it can be any data type
83 // that can be passed using a single void pointer (LPVOID).
85 string outputFileName;
86 string align, square, distcalcType, output;
87 unsigned long long start;
88 unsigned long long end;
90 float match, misMatch, gapOpen, gapExtend, cutoff;
91 int count, threadID, longestBase;
96 pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
117 /**************************************************************************************************/
118 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
120 static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){
121 pairwiseData* pDataArray;
122 pDataArray = (pairwiseData*)lpParam;
125 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
126 outFile.setf(ios::fixed, ios::showpoint);
127 outFile << setprecision(4);
129 pDataArray->count = pDataArray->end;
131 int startTime = time(NULL);
133 Alignment* alignment;
134 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
135 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
136 else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
137 else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
139 pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
140 pDataArray->m->mothurOutEndLine();
141 alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
144 ValidCalculators validCalculator;
145 Dist* distCalculator;
146 if (pDataArray->countends) {
147 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
148 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
149 else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
150 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
153 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
154 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
155 else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
156 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
160 if(pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
162 for(int i=pDataArray->start;i<pDataArray->end;i++){
164 string name = pDataArray->alignDB.get(i).getName();
165 //pad with spaces to make compatible
166 if (name.length() < 10) { while (name.length() < 10) { name += " "; } }
168 outFile << name << '\t';
170 for(int j=0;j<pDataArray->alignDB.getNumSeqs();j++){
172 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
174 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
175 alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
178 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
179 alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
182 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
183 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
185 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
186 seqI.setAligned(alignment->getSeqAAln());
187 seqJ.setAligned(alignment->getSeqBAln());
189 distCalculator->calcDist(seqI, seqJ);
190 double dist = distCalculator->getDist();
192 outFile << dist << '\t';
198 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
202 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
206 delete distCalculator;
210 catch(exception& e) {
211 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction");
216 /**************************************************************************************************/
217 static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){
218 pairwiseData* pDataArray;
219 pDataArray = (pairwiseData*)lpParam;
222 ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
223 outFile.setf(ios::fixed, ios::showpoint);
224 outFile << setprecision(4);
226 pDataArray->count = pDataArray->end;
228 int startTime = time(NULL);
230 Alignment* alignment;
231 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
232 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
233 else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
234 else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
236 pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
237 pDataArray->m->mothurOutEndLine();
238 alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
241 ValidCalculators validCalculator;
242 Dist* distCalculator;
243 if (pDataArray->countends) {
244 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
245 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
246 else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
247 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
250 if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
251 if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
252 else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
253 else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
257 if((pDataArray->output == "lt") && pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
259 for(int i=pDataArray->start;i<pDataArray->end;i++){
261 if(pDataArray->output == "lt") {
262 string name = pDataArray->alignDB.get(i).getName();
263 if (name.length() < 10) { //pad with spaces to make compatible
264 while (name.length() < 10) { name += " "; }
266 outFile << name << '\t';
270 for(int j=0;j<i;j++){
272 if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
274 if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
275 alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
278 if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
279 alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
282 Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
283 Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
285 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
286 seqI.setAligned(alignment->getSeqAAln());
287 seqJ.setAligned(alignment->getSeqBAln());
289 distCalculator->calcDist(seqI, seqJ);
290 double dist = distCalculator->getDist();
292 if(dist <= pDataArray->cutoff){
293 if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
295 if (pDataArray->output == "lt") { outFile << dist << '\t'; }
298 if (pDataArray->output == "lt") { outFile << endl; }
301 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
305 pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
309 delete distCalculator;
313 catch(exception& e) {
314 pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction");