]> git.donarmstrong.com Git - mothur.git/blobdiff - pcrseqscommand.h
Revert to previous commit
[mothur.git] / pcrseqscommand.h
diff --git a/pcrseqscommand.h b/pcrseqscommand.h
new file mode 100644 (file)
index 0000000..420a5eb
--- /dev/null
@@ -0,0 +1,385 @@
+#ifndef Mothur_pcrseqscommand_h
+#define Mothur_pcrseqscommand_h
+
+//
+//  pcrseqscommand.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 3/14/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+
+#include "command.hpp"
+#include "sequence.hpp"
+#include "trimoligos.h"
+#include "alignment.hpp"
+#include "needlemanoverlap.hpp"
+
+class PcrSeqsCommand : public Command {
+public:
+       PcrSeqsCommand(string);
+       PcrSeqsCommand();
+       ~PcrSeqsCommand(){}
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "pcr.seqs";    }
+       string getCommandCategory()             { return "Sequence Processing";         }
+       string getHelpString(); 
+       string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
+       string getDescription()         { return "pcr.seqs"; }
+    
+       int execute(); 
+       void help() { m->mothurOut(getHelpString()); }  
+       
+private:
+    
+    struct linePair {
+        unsigned long long start;
+        unsigned long long end;
+        linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+        linePair() {}
+    };
+    
+    vector<linePair> lines;
+       bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
+    bool abort, keepprimer, keepdots;
+       string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
+       int start, end, processors, length;
+       
+    vector<string> revPrimer, outputNames;
+       vector<string> primers;
+    
+    int writeAccnos(set<string>);
+    int readName(set<string>&);
+    int readGroup(set<string>);
+    int readTax(set<string>);
+    bool readOligos();
+    bool readEcoli();
+       int driverPcr(string, string, string, set<string>&, linePair);  
+       int createProcesses(string, string, string, set<string>&);
+    bool findForward(Sequence&, int&, int&);
+    bool findReverse(Sequence&, int&, int&);
+    bool isAligned(string, map<int, int>&);
+    bool compareDNASeq(string, string);
+    string reverseOligo(string);
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct pcrData {
+       string filename; 
+    string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
+       unsigned long long fstart;
+       unsigned long long fend;
+       int count, start, end, length;
+       MothurOut* m;
+       vector<string> primers;
+    vector<string> revPrimer;
+    set<string> badSeqNames;
+    bool keepprimer, keepdots;
+       
+       
+       pcrData(){}
+       pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
+               filename = f;
+        goodFasta = gf;
+        badFasta = bfn;
+               m = mout;
+        oligosfile = ol;
+        ecolifile = ec;
+        primers = pr;
+        revPrimer = rpr;
+        nomatch = nm;
+        keepprimer = kp;
+        keepdots = kd;
+               start = st;
+               end = en;
+        length = l;
+               fstart = fst;
+        fend = fen;
+               count = 0;
+       }
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ 
+       pcrData* pDataArray;
+       pDataArray = (pcrData*)lpParam;
+       
+       try {
+        ofstream goodFile;
+               pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
+        
+        ofstream badFile;
+               pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
+               
+               ifstream inFASTA;
+               pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+        
+               //print header if you are process 0
+               if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
+                       inFASTA.seekg(0);
+               }else { //this accounts for the difference in line endings. 
+                       inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); 
+               }
+        
+        set<int> lengths;
+               pDataArray->count = pDataArray->fend;
+               for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
+            
+                       if (pDataArray->m->control_pressed) {  break; }
+                       
+                       Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
+          
+            string trashCode = "";
+                       if (currSeq.getName() != "") {
+                
+                bool goodSeq = true;
+                if (pDataArray->oligosfile != "") {
+                    map<int, int> mapAligned;
+                    //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
+                    ///////////////////////////////////////////////////////////////
+                    bool aligned = false;
+                    string seq = currSeq.getAligned(); 
+                    int countBases = 0;
+                    for (int k = 0; k < seq.length(); k++) {
+                        if (!isalpha(seq[k])) { aligned = true; }
+                        else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
+                    }                                                   //ie. the 3rd base may be at spot 10 in the alignment
+                                                                        //later when we trim we want to trim from spot 10.
+                    ///////////////////////////////////////////////////////////////
+                    
+                    //process primers
+                    if (pDataArray->primers.size() != 0) {
+                        int primerStart = 0; int primerEnd = 0;
+                        //bool good = findForward(currSeq, primerStart, primerEnd);
+                        ///////////////////////////////////////////////////////////////
+                        bool good = false;
+                        string rawSequence = currSeq.getUnaligned();
+                        
+                        for(int j=0;j<pDataArray->primers.size();j++){
+                            string oligo = pDataArray->primers[j];
+                            
+                            if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
+                            
+                            if(rawSequence.length() < oligo.length()) {  break;  }
+                            
+                            //search for primer
+                            int olength = oligo.length();
+                            for (int l = 0; l < rawSequence.length()-olength; l++){
+                                if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
+                                string rawChunk = rawSequence.substr(l, olength);
+                                //compareDNASeq(oligo, rawChunk)
+                                ////////////////////////////////////////////////////////
+                                bool success = 1;
+                                for(int k=0;k<olength;k++){
+                                    
+                                    if(oligo[k] != rawChunk[k]){
+                                        if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')   {       success = 0;    }
+                                        else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                          {       success = 0;    }
+                                        else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))   {       success = 0;    }
+                                        else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))   {       success = 0;    }
+                                        else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))   {       success = 0;    }
+                                        else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))   {       success = 0;    }                       
+                                        
+                                        if(success == 0)       {       break;   }
+                                    }
+                                    else{
+                                        success = 1;
+                                    }
+                                }
+                                
+                                ////////////////////////////////////////////////////////////////////
+                                if(success) {
+                                    primerStart = j;
+                                    primerEnd = primerStart + olength;
+                                    good = true; break;
+                                }
+                            }
+                            if (good) { break; }
+                        }      
+                        
+                        if (!good) { primerStart = 0; primerEnd = 0; }
+                        ///////////////////////////////////////////////////////////////
+                        
+                        
+                        if(!good){     if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f";     }
+                        else{
+                            //are you aligned
+                            if (aligned) { 
+                                if (!pDataArray->keepprimer)    {  
+                                    if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerEnd]);   }
+                                    else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd]));                                              }
+                                } 
+                                else                {  
+                                    if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerStart]);  }
+                                    else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));                                              }
+                                }
+                            }else { 
+                                if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
+                                else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
+                            }
+                        }
+                    }
+                    
+                    //process reverse primers
+                    if (pDataArray->revPrimer.size() != 0) {
+                        int primerStart = 0; int primerEnd = 0;
+                        bool good = false;
+                        //findReverse(currSeq, primerStart, primerEnd);
+                         ///////////////////////////////////////////////////////////////
+                        string rawSequence = currSeq.getUnaligned();
+                        
+                        for(int j=0;j<pDataArray->revPrimer.size();j++){
+                            string oligo = pDataArray->revPrimer[j];
+                            if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
+                            if(rawSequence.length() < oligo.length()) {  break;  }
+                            
+                            //search for primer
+                            int olength = oligo.length();
+                            for (int l = rawSequence.length()-olength; l >= 0; l--){
+                                
+                                string rawChunk = rawSequence.substr(l, olength);
+                                //compareDNASeq(oligo, rawChunk)
+                                ////////////////////////////////////////////////////////
+                                bool success = 1;
+                                for(int k=0;k<olength;k++){
+                                    
+                                    if(oligo[k] != rawChunk[k]){
+                                        if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')   {       success = 0;    }
+                                        else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                          {       success = 0;    }
+                                        else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                 {       success = 0;    }
+                                        else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))   {       success = 0;    }
+                                        else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))   {       success = 0;    }
+                                        else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))   {       success = 0;    }
+                                        else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))   {       success = 0;    }                       
+                                        
+                                        if(success == 0)       {       break;   }
+                                    }
+                                    else{
+                                        success = 1;
+                                    }
+                                }
+                                
+                                ////////////////////////////////////////////////////////////////////
+                                if(success) {
+                                    primerStart = j;
+                                    primerEnd = primerStart + olength;
+                                    good = true; break;
+                                }
+                            }
+                            if (good) { break; }
+                        }      
+                        
+                        if (!good) { primerStart = 0; primerEnd = 0; }
+
+                         ///////////////////////////////////////////////////////////////
+                        if(!good){     if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r";     }
+                        else{ 
+                            //are you aligned
+                            if (aligned) { 
+                                if (!pDataArray->keepprimer)    {  
+                                    if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerStart]); }
+                                    else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));   }
+                                } 
+                                else                {  
+                                    if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerEnd]); }
+                                    else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd]));   }
+                                }                             }
+                            else { 
+                                if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
+                                else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
+                            }
+                        }
+                    }
+                }else if (pDataArray->ecolifile != "") {
+                    //make sure the seqs are aligned
+                    lengths.insert(currSeq.getAligned().length());
+                    if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+                    else if (currSeq.getAligned().length() != pDataArray->length) {
+                        pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; 
+                    }else {
+                        if (pDataArray->keepdots)   { 
+                            currSeq.filterToPos(pDataArray->start); 
+                            currSeq.filterFromPos(pDataArray->end);
+                        }else {
+                            string seqString = currSeq.getAligned().substr(0, pDataArray->end);
+                            seqString = seqString.substr(pDataArray->start);
+                            currSeq.setAligned(seqString); 
+                        }
+                    }
+                }else{ //using start and end to trim
+                    //make sure the seqs are aligned
+                    lengths.insert(currSeq.getAligned().length());
+                    if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+                    else {
+                        if (pDataArray->end != -1) {
+                            if (pDataArray->end > currSeq.getAligned().length()) {  pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
+                            else {
+                                if (pDataArray->keepdots)   { currSeq.filterFromPos(pDataArray->end); }
+                                else {
+                                    string seqString = currSeq.getAligned().substr(0, pDataArray->end);
+                                    currSeq.setAligned(seqString); 
+                                }
+                            }
+                        }
+                        if (pDataArray->start != -1) { 
+                            if (pDataArray->keepdots)   {  currSeq.filterToPos(pDataArray->start);  }
+                            else {
+                                string seqString = currSeq.getAligned().substr(pDataArray->start);
+                                currSeq.setAligned(seqString); 
+                            }
+                        }
+                        
+                    }
+                }
+                
+                               if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
+                               else {  
+                    pDataArray->badSeqNames.insert(currSeq.getName()); 
+                    currSeq.setName(currSeq.getName() + '|' + trashCode);
+                    currSeq.printSequence(badFile); 
+                }
+                       }
+                                               
+                       //report progress
+                       if((i+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine();           }
+               }
+               //report progress
+               if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();              }
+               
+               goodFile.close();
+               inFASTA.close();
+        badFile.close();
+        
+        return 0;
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
+               exit(1);
+       }
+} 
+
+#endif
+
+/**************************************************************************************************/
+
+
+
+#endif