X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=pairwiseseqscommand.h;h=defccb5227104798667b49c879617bf9d1811200;hp=69785186caa063419a06a7dc73054344ebcfeae9;hb=1a20e24ee786195ab0e1cccd4f5aede7a88f3f4e;hpb=e150b0b0664caec517485ee6d69dcdade6dcae77 diff --git a/pairwiseseqscommand.h b/pairwiseseqscommand.h index 6978518..defccb5 100644 --- a/pairwiseseqscommand.h +++ b/pairwiseseqscommand.h @@ -17,6 +17,18 @@ #include "validcalculator.h" #include "dist.h" #include "sequencedb.h" +#include "sequence.hpp" + +#include "gotohoverlap.hpp" +#include "needlemanoverlap.hpp" +#include "blastalign.hpp" +#include "noalign.hpp" + +#include "ignoregaps.h" +#include "eachgapdist.h" +#include "eachgapignore.h" +#include "onegapdist.h" +#include "onegapignore.h" class PairwiseSeqsCommand : public Command { @@ -28,9 +40,12 @@ public: vector setParameters(); string getCommandName() { return "pairwise.seqs"; } string getCommandCategory() { return "Sequence Processing"; } - string getHelpString(); - string getCitation() { return "http://www.mothur.org/wiki/Pairwise.seqs"; } + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "Needleman SB, Wunsch CD (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 48: 443-53. [ for needleman ]\nGotoh O (1982). An improved algorithm for matching biological sequences. J Mol Biol 162: 705-8. [ for gotoh ] \nhttp://www.mothur.org/wiki/Pairwise.seqs"; } + string getDescription() { return "calculates pairwise distances from an unaligned fasta file"; } + int execute(); void help() { m->mothurOut(getHelpString()); } @@ -43,8 +58,6 @@ private: vector processIDS; //end line, processid vector lines; - Alignment* alignment; - Dist* distCalculator; SequenceDB alignDB; void createProcesses(string); @@ -53,18 +66,262 @@ private: #ifdef USE_MPI int driverMPI(int, int, MPI_File&, float); - int driverMPI(int, int, string, unsigned long int&); - int driverMPI(int, int, string, unsigned long int&, string); + int driverMPI(int, int, string, unsigned long long&); + int driverMPI(int, int, string, unsigned long long&, string); #endif string fastaFileName, align, calc, outputDir, output; float match, misMatch, gapOpen, gapExtend, cutoff; - int processors; + int processors, longestBase; vector fastaFileNames, Estimators; vector outputNames; bool abort, countends, compress; }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct pairwiseData { + string outputFileName; + string align, square, distcalcType, output; + unsigned long long start; + unsigned long long end; + MothurOut* m; + float match, misMatch, gapOpen, gapExtend, cutoff; + int count, threadID, longestBase; + bool countends; + SequenceDB alignDB; + + pairwiseData(){} + pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, float cu, int tid) { + outputFileName = ofn; + m = mout; + start = st; + end = en; + match = ma; + misMatch = misMa; + gapOpen = gapO; + gapExtend = gapE; + longestBase = thr; + align = al; + square = sq; + distcalcType = di; + countends = co; + alignDB = DB; + count = 0; + output = op; + cutoff = cu; + threadID = tid; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){ + pairwiseData* pDataArray; + pDataArray = (pairwiseData*)lpParam; + + try { + ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc); + outFile.setf(ios::fixed, ios::showpoint); + outFile << setprecision(4); + + pDataArray->count = 0; + + int startTime = time(NULL); + + Alignment* alignment; + if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); } + else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); } + else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); } + else if(pDataArray->align == "noalign") { alignment = new NoAlign(); } + else { + pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman."); + pDataArray->m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); + } + + ValidCalculators validCalculator; + Dist* distCalculator; + if (pDataArray->countends) { + if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { + if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); } + else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); } + else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); } + } + }else { + if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { + if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); } + else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); } + else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); } + } + } + + if(pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; } + + for(int i=pDataArray->start;iend;i++){ + pDataArray->count++; + + string name = pDataArray->alignDB.get(i).getName(); + //pad with spaces to make compatible + if (name.length() < 10) { while (name.length() < 10) { name += " "; } } + + outFile << name << '\t'; + + for(int j=0;jalignDB.getNumSeqs();j++){ + + if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; } + + if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) { + alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1); + } + + if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) { + alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1); + } + + Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned()); + Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned()); + + alignment->align(seqI.getUnaligned(), seqJ.getUnaligned()); + seqI.setAligned(alignment->getSeqAAln()); + seqJ.setAligned(alignment->getSeqBAln()); + + distCalculator->calcDist(seqI, seqJ); + double dist = distCalculator->getDist(); + + outFile << dist << '\t'; + } + + outFile << endl; + + if(i % 100 == 0){ + pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine(); + } + + } + pDataArray->m->mothurOut(toString(pDataArray->count) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine(); + + outFile.close(); + delete alignment; + delete distCalculator; + + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction"); + exit(1); + } +} + +/**************************************************************************************************/ +static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){ + pairwiseData* pDataArray; + pDataArray = (pairwiseData*)lpParam; + + try { + ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc); + outFile.setf(ios::fixed, ios::showpoint); + outFile << setprecision(4); + + int startTime = time(NULL); + + Alignment* alignment; + if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); } + else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); } + else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); } + else if(pDataArray->align == "noalign") { alignment = new NoAlign(); } + else { + pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman."); + pDataArray->m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); + } + + ValidCalculators validCalculator; + Dist* distCalculator; + if (pDataArray->countends) { + if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { + if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); } + else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); } + else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); } + } + }else { + if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) { + if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); } + else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); } + else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); } + } + } + + if((pDataArray->output == "lt") && pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; } + + pDataArray->count = 0; + for(int i=pDataArray->start;iend;i++){ + pDataArray->count++; + + if(pDataArray->output == "lt") { + string name = pDataArray->alignDB.get(i).getName(); + if (name.length() < 10) { //pad with spaces to make compatible + while (name.length() < 10) { name += " "; } + } + outFile << name << '\t'; + } + + + for(int j=0;jm->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; } + + if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) { + alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1); + } + + if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) { + alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1); + } + + Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned()); + Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned()); + + alignment->align(seqI.getUnaligned(), seqJ.getUnaligned()); + seqI.setAligned(alignment->getSeqAAln()); + seqJ.setAligned(alignment->getSeqBAln()); + + distCalculator->calcDist(seqI, seqJ); + double dist = distCalculator->getDist(); + + if(dist <= pDataArray->cutoff){ + if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; } + } + if (pDataArray->output == "lt") { outFile << dist << '\t'; } + } + + if (pDataArray->output == "lt") { outFile << endl; } + + if(i % 100 == 0){ + pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine(); + } + + } + pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine(); + + outFile.close(); + delete alignment; + delete distCalculator; + + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction"); + exit(1); + } +} + +#endif + + #endif