]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.cpp
changed reading of name file to use buffered reads. note the splitAtWhiteSpace functi...
[mothur.git] / trimseqscommand.cpp
index 9913727dcdc8257816e1f83fab2f8a4b903ff1cd..6f5bb979a4d45b9fc3d8201848aa9f6b2d2dddb7 100644 (file)
@@ -329,6 +329,8 @@ int TrimSeqsCommand::execute(){
                
                numFPrimers = 0;  //this needs to be initialized
                numRPrimers = 0;
+        numSpacers = 0;
+        numLinkers = 0;
                createGroup = false;
                vector<vector<string> > fastaFileNames;
                vector<vector<string> > qualFileNames;
@@ -435,6 +437,17 @@ int TrimSeqsCommand::execute(){
                                        
                                        Sequence currSeq(in); m->gobble(in);
                                        out << currSeq.getName() << '\t' << it->second << endl;
+                    
+                    if (nameFile != "") {
+                        map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+                        if (itName != nameMap.end()) { 
+                            vector<string> thisSeqsNames; 
+                            m->splitAtChar(itName->second, thisSeqsNames, ',');
+                            for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                out << thisSeqsNames[k] << '\t' << it->second << endl;
+                            }
+                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }                                                  
+                    }
                                }
                                in.close();
                                out.close();
@@ -549,7 +562,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                int count = 0;
                bool moreSeqs = 1;
-               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
        
                while (moreSeqs) {
                                
@@ -594,6 +607,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        if(success > bdiffs)            {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
+                
+                if(rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > bdiffs)            {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
                                
                 if(numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
@@ -668,6 +687,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                currQual.printQScores(trimQualFile);
                                        }
                                        
+                    
                                        if(nameFile != ""){
                                                map<string, string>::iterator itName = nameMap.find(currSeq.getName());
                                                if (itName != nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
@@ -689,11 +709,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                        
                                                        outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
                                                        
+                            int numRedundants = 0;
                                                        if (nameFile != "") {
                                                                map<string, string>::iterator itName = nameMap.find(currSeq.getName());
                                                                if (itName != nameMap.end()) { 
                                                                        vector<string> thisSeqsNames; 
                                                                        m->splitAtChar(itName->second, thisSeqsNames, ',');
+                                    numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
                                                                        for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
                                                                                outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
                                                                        }
@@ -701,8 +723,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                        }
                                                        
                                                        map<string, int>::iterator it = groupCounts.find(thisGroup);
-                                                       if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
-                                                       else { groupCounts[it->first]++; }
+                                                       if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1 + numRedundants; }
+                                                       else { groupCounts[it->first] += (1 + numRedundants); }
                                                                
                                                }
                                        }
@@ -746,7 +768,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                count++;
                        }
                        
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                                unsigned long long pos = inFASTA.tellg();
                                if ((pos == -1) || (pos >= line.end)) { break; }
                        
@@ -786,7 +808,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                int exitCommand = 1;
                processIDS.clear();
                
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                                //loop through and create all the processes you want
                while (process != processors) {
                        int pid = fork();
@@ -933,7 +955,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                               tempPrimerQualFileNames,
                                               tempNameFileNames,
                                               lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
-                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, 
+                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, 
                                              primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
                                               qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
                                              minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
@@ -1027,7 +1049,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                }
                        }
                        
-            #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+            #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                        if(createGroup){
                                ifstream in;
                                string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
@@ -1067,7 +1089,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
         vector<unsigned long long> fastaFilePos;
                vector<unsigned long long> qfileFilePos;
                
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                //set file positions for fasta file
                fastaFilePos = m->divideFile(filename, processors);
                
@@ -1157,6 +1179,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
         }else{
             int numFastaSeqs = 0;
             fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); 
+            if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
         
             if (qfilename != "") { 
                 int numQualSeqs = 0;
@@ -1173,12 +1196,10 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
                 int startIndex =  i * numSeqsPerProcessor;
                 if(i == (processors - 1)){     numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
                 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
-                cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
                 if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
             }
-        
-            if(qfilename == "")        {       qLines = lines; } //files with duds
         }
+            if(qfilename == "")        {       qLines = lines; } //files with duds
                        return 1;
                
                #endif
@@ -1242,13 +1263,36 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                        primerNameVector.push_back(group);
                                }
                                else if(type == "REVERSE"){
-                                       Sequence oligoRC("reverse", oligo);
-                                       oligoRC.reverseComplement();
-                                       revPrimer.push_back(oligoRC.getUnaligned());
+                                       //Sequence oligoRC("reverse", oligo);
+                                       //oligoRC.reverseComplement();
+                    string oligoRC = reverseOligo(oligo);
+                                       revPrimer.push_back(oligoRC);
                                }
                                else if(type == "BARCODE"){
                                        inOligos >> group;
+                    
+                    //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
+                    //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
+                                       string temp = "";
+                    while (!inOligos.eof())    {       
+                                               char c = inOligos.get(); 
+                                               if (c == 10 || c == 13){        break;  }
+                                               else if (c == 32 || c == 9){;} //space or tab
+                                               else {  temp += c;  }
+                                       } 
                                        
+                    //then this is illumina data with 4 columns
+                    if (temp != "") {  
+                        string reverseBarcode = reverseOligo(group); //reverse barcode
+                        group = temp;
+                        
+                        //check for repeat barcodes
+                        map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+                        if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine();  }
+                                               
+                        rbarcodes[reverseBarcode]=indexBarcode; 
+                    }
+                        
                                        //check for repeat barcodes
                                        map<string, int>::iterator itBar = barcodes.find(oligo);
                                        if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
@@ -1467,6 +1511,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){
        }
        
 }
+//********************************************************************/
+string TrimSeqsCommand::reverseOligo(string oligo){
+       try {
+        string reverse = "";
+        
+        for(int i=oligo.length()-1;i>=0;i--){
+            
+            if(oligo[i] == 'A')                {       reverse += 'T'; }
+            else if(oligo[i] == 'T'){  reverse += 'A'; }
+            else if(oligo[i] == 'U'){  reverse += 'A'; }
+            
+            else if(oligo[i] == 'G'){  reverse += 'C'; }
+            else if(oligo[i] == 'C'){  reverse += 'G'; }
+            
+            else if(oligo[i] == 'R'){  reverse += 'Y'; }
+            else if(oligo[i] == 'Y'){  reverse += 'R'; }
+            
+            else if(oligo[i] == 'M'){  reverse += 'K'; }
+            else if(oligo[i] == 'K'){  reverse += 'M'; }
+            
+            else if(oligo[i] == 'W'){  reverse += 'W'; }
+            else if(oligo[i] == 'S'){  reverse += 'S'; }
+            
+            else if(oligo[i] == 'B'){  reverse += 'V'; }
+            else if(oligo[i] == 'V'){  reverse += 'B'; }
+            
+            else if(oligo[i] == 'D'){  reverse += 'H'; }
+            else if(oligo[i] == 'H'){  reverse += 'D'; }
+            
+            else                                               {       reverse += 'N'; }
+        }
+        
+        
+        return reverse;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
+               exit(1);
+       }
+}
 
 //***************************************************************************************************************