]> git.donarmstrong.com Git - mothur.git/blobdiff - makecontigscommand.h
added sets to amova and homova commands. added oligos to make.contigs. added metadat...
[mothur.git] / makecontigscommand.h
index 84e43c01c9a0b8c7775c6a46494a6fc0fcb642a6..86a8450afaf76ba3be6a167ce8b856a605f09166 100644 (file)
@@ -29,6 +29,14 @@ struct fastqRead {
        ~fastqRead() {};
 };
 
+struct pairFastqRead {
+       fastqRead forward;
+    fastqRead reverse;
+       
+       pairFastqRead() {};
+       pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
+       ~pairFastqRead() {};
+};
 /**************************************************************************************************/
 
 class MakeContigsCommand : public Command {
@@ -50,7 +58,7 @@ public:
     void help() { m->mothurOut(getHelpString()); }     
     
 private:
-    bool abort, allFiles;
+    bool abort, allFiles, createGroup;
     string outputDir, ffastqfile, rfastqfile, align, oligosfile;
        float match, misMatch, gapOpen, gapExtend;
        int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
@@ -63,18 +71,20 @@ private:
        vector<string> primerNameVector;        
        vector<string> barcodeNameVector;       
     
-       map<string, int> groupCounts;  
+       map<string, int> groupCounts; 
+    map<string, string> groupMap;
     //map<string, int> combos;
        //map<string, int> groupToIndex;
     //vector<string> groupVector;
     
-    fastqRead readFastq(ifstream&);
+    fastqRead readFastq(ifstream&, bool&);
     vector< vector<string> > readFastqFiles(int&);
     bool checkReads(fastqRead&, fastqRead&);
-    int createProcesses(vector< vector<string> >, string, string, string);
-    int driver(vector<string>, string, string, string);
+    int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
+    int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
     bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
     string reverseOligo(string);
+    vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
 };
 
 /**************************************************************************************************/
@@ -86,15 +96,26 @@ private:
 struct contigsData {
        string outputFasta; 
        string outputQual; 
+    string outputScrapFasta; 
+       string outputScrapQual;
        string outputMisMatches;
        string align;
     vector<string> files;
+    vector<vector<string> > fastaFileNames;
+    vector<vector<string> > qualFileNames;
        MothurOut* m;
        float match, misMatch, gapOpen, gapExtend;
-       int count, threshold, threadID;
+       int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
+    bool allFiles, createGroup;
+    map<string, int> groupCounts; 
+    map<string, string> groupMap;
+    vector<string> primerNameVector;   
+       vector<string> barcodeNameVector;
+    map<int, oligosPair> barcodes;
+       map<int, oligosPair> primers;
        
        contigsData(){}
-       contigsData(vector<string> f, string of, string oq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
+       contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
         files = f;
                outputFasta = of;
         outputQual = oq;
@@ -107,6 +128,19 @@ struct contigsData {
         threshold = thr;
                align = al;
                count = 0;
+        outputScrapFasta = osf;
+        outputScrapQual = osq;
+        fastaFileNames = ffn;
+        qualFileNames = qfn;
+        barcodes = br;
+        primers = pr;
+        barcodeNameVector = bnv;
+        primerNameVector = pnv;
+        pdiffs = pdf;
+        bdiffs = bdf;
+        tdiffs = tdf;
+        allFiles = all;
+        createGroup = cg;
                threadID = tid;
        }
 };
@@ -132,32 +166,69 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
         
         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
         
+               if(pDataArray->allFiles){
+                       for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
+                               for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
+                                       if (pDataArray->fastaFileNames[i][j] != "") {
+                                               ofstream temp;
+                                               pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
+                        pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                  temp.close();
+                                       }
+                               }
+                       }
+               }
+        
         ifstream inFFasta, inRFasta, inFQual, inRQual;
         pDataArray->m->openInputFile(thisffastafile, inFFasta);
         pDataArray->m->openInputFile(thisfqualfile, inFQual);
         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
         pDataArray->m->openInputFile(thisrqualfile, inRQual);
         
-        ofstream outFasta, outQual, outMisMatch;
+        ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
         pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
+        pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
+        pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
         outMisMatch << "Name\tLength\tMisMatches\n";
         
+        TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
+        
         while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
             
             if (pDataArray->m->control_pressed) { break; }
             
+            int success = 1;
+            string trashCode = "";
+            int currentSeqsDiffs = 0;
+            
             //read seqs and quality info
             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
             QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual);
             QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual);
             
+            int barcodeIndex = 0;
+            int primerIndex = 0;
+            
+            if(pDataArray->barcodes.size() != 0){
+                success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex);
+                if(success > pDataArray->bdiffs)               {       trashCode += 'b';       }
+                else{ currentSeqsDiffs += success;  }
+            }
+            
+            if(pDataArray->primers.size() != 0){
+                success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex);
+                if(success > pDataArray->pdiffs)               {       trashCode += 'f';       }
+                else{ currentSeqsDiffs += success;  }
+            }
+            
+            if (currentSeqsDiffs > pDataArray->tdiffs) {       trashCode += 't';   }
+            
             //flip the reverse reads
             rSeq.reverseComplement();
             rQual.flipQScores();
-            
+           
             //pairwise align
             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
@@ -172,29 +243,51 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
             int numMismatches = 0;
             string seq1 = fSeq.getAligned();
             string seq2 = rSeq.getAligned();
-            
             vector<int> scores1 = fQual.getQualityScores();
             vector<int> scores2 = rQual.getQualityScores();
             
-            for (int i = 0; i < length; i++) {
+            int overlapStart = fSeq.getStartPos();
+            int seq2Start = rSeq.getStartPos();
+            //bigger of the 2 starting positions is the location of the overlapping start
+            if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
+                overlapStart = seq2Start; 
+                for (int i = 0; i < overlapStart; i++) {
+                    contig += seq1[i];
+                    contigScores.push_back(scores1[ABaseMap[i]]);
+                }
+            }else { //seq1 starts later so take from 0 to overlapStart from seq2
+                for (int i = 0; i < overlapStart; i++) {
+                    contig += seq2[i];
+                    contigScores.push_back(scores2[BBaseMap[i]]);
+                }
+            }
+            
+            int seq1End = fSeq.getEndPos();
+            int seq2End = rSeq.getEndPos();
+            int overlapEnd = seq1End;
+            if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
+            
+            for (int i = overlapStart; i < overlapEnd; i++) {
                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
                     contig += seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
-                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; }
+                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
-                    if (scores2[BBaseMap[i]] >= pDataArray->threshold) {
+                    if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
+                    else {
                         contig += seq2[i];
                         contigScores.push_back(scores2[BBaseMap[i]]);
                     }
                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
-                    if (scores1[ABaseMap[i]] >= pDataArray->threshold) { 
+                    if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
+                    else {
                         contig += seq1[i];
                         contigScores.push_back(scores1[ABaseMap[i]]);
                     }
                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
                     char c = seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
-                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; }
+                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
                     contig += c;
                     numMismatches++;
                 }else { //should never get here
@@ -202,13 +295,70 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
                 }
             }
             
-            //output
-            outFasta << ">" << fSeq.getName() << endl << contig << endl;
-            outQual << ">" << fSeq.getName() << endl;
-            for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
-            outQual << endl;
-            outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
-            
+            if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq2[i];
+                    contigScores.push_back(scores2[BBaseMap[i]]);
+                }
+            }else { //seq2 ends before seq1 so take from overlap to length from seq1
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq1[i];
+                    contigScores.push_back(scores1[ABaseMap[i]]);
+                }
+                
+            }
+
+            if(trashCode.length() == 0){
+                if (pDataArray->createGroup) {
+                    if(pDataArray->barcodes.size() != 0){
+                        string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
+                        if (pDataArray->primers.size() != 0) { 
+                            if (pDataArray->primerNameVector[primerIndex] != "") { 
+                                if(thisGroup != "") {
+                                    thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
+                                }else {
+                                    thisGroup = pDataArray->primerNameVector[primerIndex]; 
+                                }
+                            } 
+                        }
+                        
+                        if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
+                        
+                        pDataArray->groupMap[fSeq.getName()] = thisGroup; 
+                        
+                        map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
+                        if (it == pDataArray->groupCounts.end()) {     pDataArray->groupCounts[thisGroup] = 1; }
+                        else { pDataArray->groupCounts[it->first] ++; }
+                        
+                    }
+                }
+                
+                if(pDataArray->allFiles){
+                    ofstream output;
+                    pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+                    output << ">" << fSeq.getName() << endl << contig << endl;
+                    output.close();
+                    
+                    pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+                    output << ">" << fSeq.getName() << endl;
+                    for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
+                    output << endl;
+                    output.close();                                                    
+                }
+                
+                //output
+                outFasta << ">" << fSeq.getName() << endl << contig << endl;
+                outQual << ">" << fSeq.getName() << endl;
+                for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
+                outQual << endl;
+                outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
+            }else {
+                //output
+                outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
+                outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
+                for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
+                outScrapQual << endl;
+            }
             num++;
             
                        //report progress
@@ -225,9 +375,11 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
         outFasta.close();
         outQual.close();
         outMisMatch.close();
+        outScrapFasta.close();
+        outScrapQual.close();
         delete alignment;
         
-        if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);}
+        if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);}
         
         return 0;