]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.cpp
added oligos, pdiffs, bdiffs, ldiffs, sdiffs, tiffs parameters to sffinfo to allow...
[mothur.git] / trimseqscommand.cpp
index 9913727dcdc8257816e1f83fab2f8a4b903ff1cd..88ed32bd79bfcdc513ae4c848c775a95e762cd57 100644 (file)
@@ -97,6 +97,29 @@ string TrimSeqsCommand::getHelpString(){
                exit(1);
        }
 }
+//**********************************************************************************************************************
+string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){        
+       try {
+        string outputFileName = "";
+               map<string, vector<string> >::iterator it;
+        
+        //is this a type this command creates
+        it = outputTypes.find(type);
+        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+        else {
+            if (type == "qfile")            {   outputFileName =  "qual";   }
+            else if (type == "fasta")            {   outputFileName =  "fasta";   }
+            else if (type == "group")            {   outputFileName =  "groups";   }
+            else if (type == "name")            {   outputFileName =  "names";   }
+            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
+        }
+        return outputFileName;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
+               exit(1);
+       }
+}
 
 
 //**********************************************************************************************************************
@@ -329,19 +352,21 @@ int TrimSeqsCommand::execute(){
                
                numFPrimers = 0;  //this needs to be initialized
                numRPrimers = 0;
+        numSpacers = 0;
+        numLinkers = 0;
                createGroup = false;
                vector<vector<string> > fastaFileNames;
                vector<vector<string> > qualFileNames;
                vector<vector<string> > nameFileNames;
                
-               string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
+               string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
                outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
                
-               string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
+               string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
                outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
                
-               string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
-               string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
+               string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
+               string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
                
                if (qFileName != "") {
                        outputNames.push_back(trimQualFile);
@@ -350,8 +375,8 @@ int TrimSeqsCommand::execute(){
                        outputTypes["qfile"].push_back(scrapQualFile); 
                }
                
-               string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
-               string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
+               string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
+               string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
                
                if (nameFile != "") {
                        m->readNames(nameFile, nameMap);
@@ -367,7 +392,7 @@ int TrimSeqsCommand::execute(){
                if(oligoFile != ""){
                        createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
                        if (createGroup) {
-                               outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
+                               outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
                                outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
                        }
                }
@@ -426,7 +451,7 @@ int TrimSeqsCommand::execute(){
                                m->openInputFile(it->first, in);
                                
                                ofstream out;
-                               string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
+                               string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group");
                                outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
                                m->openOutputFile(thisGroupName, out);
                                
@@ -435,6 +460,17 @@ int TrimSeqsCommand::execute(){
                                        
                                        Sequence currSeq(in); m->gobble(in);
                                        out << currSeq.getName() << '\t' << it->second << endl;
+                    
+                    if (nameFile != "") {
+                        map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+                        if (itName != nameMap.end()) { 
+                            vector<string> thisSeqsNames; 
+                            m->splitAtChar(itName->second, thisSeqsNames, ',');
+                            for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                out << thisSeqsNames[k] << '\t' << it->second << endl;
+                            }
+                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }                                                  
+                    }
                                }
                                in.close();
                                out.close();
@@ -549,7 +585,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                int count = 0;
                bool moreSeqs = 1;
-               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
        
                while (moreSeqs) {
                                
@@ -571,9 +607,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
 
                        Sequence currSeq(inFASTA); m->gobble(inFASTA);
                        //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
+            
                        QualityScores currQual;
                        if(qFileName != ""){
                                currQual = QualityScores(qFile);  m->gobble(qFile);
+                //cout << currQual.getName() << endl;
                        }
                        
                        string origSeq = currSeq.getUnaligned();
@@ -594,6 +632,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        if(success > bdiffs)            {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
+                
+                if(rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > bdiffs)            {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
                                
                 if(numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
@@ -668,6 +712,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                currQual.printQScores(trimQualFile);
                                        }
                                        
+                    
                                        if(nameFile != ""){
                                                map<string, string>::iterator itName = nameMap.find(currSeq.getName());
                                                if (itName != nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
@@ -689,11 +734,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                        
                                                        outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
                                                        
+                            int numRedundants = 0;
                                                        if (nameFile != "") {
                                                                map<string, string>::iterator itName = nameMap.find(currSeq.getName());
                                                                if (itName != nameMap.end()) { 
                                                                        vector<string> thisSeqsNames; 
                                                                        m->splitAtChar(itName->second, thisSeqsNames, ',');
+                                    numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
                                                                        for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
                                                                                outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
                                                                        }
@@ -701,8 +748,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                        }
                                                        
                                                        map<string, int>::iterator it = groupCounts.find(thisGroup);
-                                                       if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
-                                                       else { groupCounts[it->first]++; }
+                                                       if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1 + numRedundants; }
+                                                       else { groupCounts[it->first] += (1 + numRedundants); }
                                                                
                                                }
                                        }
@@ -746,7 +793,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                count++;
                        }
                        
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                                unsigned long long pos = inFASTA.tellg();
                                if ((pos == -1) || (pos >= line.end)) { break; }
                        
@@ -786,7 +833,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                int exitCommand = 1;
                processIDS.clear();
                
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                                //loop through and create all the processes you want
                while (process != processors) {
                        int pid = fork();
@@ -836,6 +883,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                                                 tempNameFileNames,
                                                                 lines[process],
                                                                 qLines[process]);
+                
+                if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
                                
                                //pass groupCounts to parent
                                if(createGroup){
@@ -933,7 +982,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                               tempPrimerQualFileNames,
                                               tempNameFileNames,
                                               lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
-                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, 
+                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, 
                                              primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
                                               qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
                                              minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
@@ -1027,7 +1076,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                }
                        }
                        
-            #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+            #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                        if(createGroup){
                                ifstream in;
                                string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
@@ -1067,7 +1116,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
         vector<unsigned long long> fastaFilePos;
                vector<unsigned long long> qfileFilePos;
                
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                //set file positions for fasta file
                fastaFilePos = m->divideFile(filename, processors);
                
@@ -1140,6 +1189,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
         }
         
         for (int i = 0; i < (fastaFilePos.size()-1); i++) {
+            if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
                        lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
                        if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)]));  }
                }       
@@ -1157,6 +1207,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
         }else{
             int numFastaSeqs = 0;
             fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); 
+            if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
         
             if (qfilename != "") { 
                 int numQualSeqs = 0;
@@ -1173,12 +1224,10 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
                 int startIndex =  i * numSeqsPerProcessor;
                 if(i == (processors - 1)){     numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
                 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
-                cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
                 if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
             }
-        
-            if(qfilename == "")        {       qLines = lines; } //files with duds
         }
+            if(qfilename == "")        {       qLines = lines; } //files with duds
                        return 1;
                
                #endif
@@ -1242,13 +1291,36 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                        primerNameVector.push_back(group);
                                }
                                else if(type == "REVERSE"){
-                                       Sequence oligoRC("reverse", oligo);
-                                       oligoRC.reverseComplement();
-                                       revPrimer.push_back(oligoRC.getUnaligned());
+                                       //Sequence oligoRC("reverse", oligo);
+                                       //oligoRC.reverseComplement();
+                    string oligoRC = reverseOligo(oligo);
+                                       revPrimer.push_back(oligoRC);
                                }
                                else if(type == "BARCODE"){
                                        inOligos >> group;
+                    
+                    //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
+                    //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
+                                       string temp = "";
+                    while (!inOligos.eof())    {       
+                                               char c = inOligos.get(); 
+                                               if (c == 10 || c == 13){        break;  }
+                                               else if (c == 32 || c == 9){;} //space or tab
+                                               else {  temp += c;  }
+                                       } 
                                        
+                    //then this is illumina data with 4 columns
+                    if (temp != "") {  
+                        string reverseBarcode = reverseOligo(group); //reverse barcode
+                        group = temp;
+                        
+                        //check for repeat barcodes
+                        map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+                        if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine();  }
+                                               
+                        rbarcodes[reverseBarcode]=indexBarcode; 
+                    }
+                        
                                        //check for repeat barcodes
                                        map<string, int>::iterator itBar = barcodes.find(oligo);
                                        if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
@@ -1260,7 +1332,7 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                }else if(type == "SPACER"){
                                        spacer.push_back(oligo);
                                }
-                               else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
+                               else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
                        }
                        m->gobble(inOligos);
                }       
@@ -1467,6 +1539,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){
        }
        
 }
+//********************************************************************/
+string TrimSeqsCommand::reverseOligo(string oligo){
+       try {
+        string reverse = "";
+        
+        for(int i=oligo.length()-1;i>=0;i--){
+            
+            if(oligo[i] == 'A')                {       reverse += 'T'; }
+            else if(oligo[i] == 'T'){  reverse += 'A'; }
+            else if(oligo[i] == 'U'){  reverse += 'A'; }
+            
+            else if(oligo[i] == 'G'){  reverse += 'C'; }
+            else if(oligo[i] == 'C'){  reverse += 'G'; }
+            
+            else if(oligo[i] == 'R'){  reverse += 'Y'; }
+            else if(oligo[i] == 'Y'){  reverse += 'R'; }
+            
+            else if(oligo[i] == 'M'){  reverse += 'K'; }
+            else if(oligo[i] == 'K'){  reverse += 'M'; }
+            
+            else if(oligo[i] == 'W'){  reverse += 'W'; }
+            else if(oligo[i] == 'S'){  reverse += 'S'; }
+            
+            else if(oligo[i] == 'B'){  reverse += 'V'; }
+            else if(oligo[i] == 'V'){  reverse += 'B'; }
+            
+            else if(oligo[i] == 'D'){  reverse += 'H'; }
+            else if(oligo[i] == 'H'){  reverse += 'D'; }
+            
+            else                                               {       reverse += 'N'; }
+        }
+        
+        
+        return reverse;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
+               exit(1);
+       }
+}
 
 //***************************************************************************************************************