]> git.donarmstrong.com Git - mothur.git/blobdiff - sffinfocommand.cpp
added oligos class. added check orient parameter to trim.flows, sffinfo, fastq.info...
[mothur.git] / sffinfocommand.cpp
index 26d48320e685d28e94e8e5dc417e875484710b13..dc8eb54abb749bffe4d0ffab55e53331f1933b32 100755 (executable)
@@ -18,6 +18,7 @@ vector<string> SffInfoCommand::setParameters(){
        try {           \r
                CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(psff);\r
         CommandParameter poligos("oligos", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(poligos);\r
+        CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient);\r
         CommandParameter pgroup("group", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(pgroup);\r
                CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos);\r
                CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "","",false,false); parameters.push_back(psfftxt);\r
@@ -47,7 +48,7 @@ string SffInfoCommand::getHelpString(){
        try {\r
                string helpString = "";\r
                helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n";\r
-               helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, group, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n";\r
+               helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, group, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs, checkorient and trim. sff is required. \n";\r
                helpString += "The sff parameter allows you to enter the sff file you would like to extract data from.  You may enter multiple files by separating them by -'s.\n";\r
                helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated.  Default=True. \n";\r
                helpString += "The qfile parameter allows you to indicate if you would like a quality file generated.  Default=True. \n";\r
@@ -58,6 +59,7 @@ string SffInfoCommand::getHelpString(){
                helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";\r
         helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";\r
                helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";\r
+        helpString += "The checkorient parameter will check look for the reverse compliment of the barcode or primer in the sequence. The default is false.\n";\r
                helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated.  Default=True. \n";\r
                helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated.  Default=False. \n";\r
                helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n";\r
@@ -492,6 +494,8 @@ SffInfoCommand::SffInfoCommand(string option)  {
                                else {  m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true;  }\r
                        }\r
             \r
+            temp = validParameter.validFile(parameters, "checkorient", false);         if (temp == "not found") { temp = "F"; }\r
+                       reorient = m->isTrue(temp);\r
             \r
                }\r
        }\r
@@ -563,13 +567,22 @@ int SffInfoCommand::execute(){
 //**********************************************************************************************************************\r
 int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){\r
        try {\r
+        oligosObject = new Oligos();\r
                currentFileName = input;\r
                if (outputDir == "") {  outputDir += m->hasPath(input); }\r
                \r
                if (accnos != "")       {  readAccnosFile(accnos);  }\r
                else                            {       seqNames.clear();               }\r
         \r
-        if (hasOligos)   {   readOligos(oligos);    split = 2;      }\r
+        TrimOligos* trimOligos = NULL; TrimOligos* rtrimOligos = NULL;\r
+        if (hasOligos)   {\r
+            readOligos(oligos);    split = 2;\r
+            if (m->control_pressed) { delete oligosObject; return 0; }\r
+            trimOligos = new TrimOligos(pdiffs, bdiffs, ldiffs, sdiffs, oligosObject->getPrimers(), oligosObject->getBarcodes(), oligosObject->getReversePrimers(), oligosObject->getLinkers(), oligosObject->getSpacers());  numFPrimers = oligosObject->getPrimers().size(); numBarcodes = oligosObject->getBarcodes().size();\r
+            if (reorient) {\r
+                rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, oligosObject->getReorientedPairedPrimers(), oligosObject->getReorientedPairedBarcodes()); numBarcodes = oligosObject->getReorientedPairedBarcodes().size();\r
+            }\r
+        }\r
         if (hasGroup)    {   readGroup(oligos);     split = 2;      }\r
         \r
                ofstream outSfftxt, outFasta, outQual, outFlow;\r
@@ -599,8 +612,8 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){
                int count = 0;\r
                \r
                //check magic number and version\r
-               if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; }\r
-               if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; }\r
+               if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); delete oligosObject; if (hasOligos)   { delete trimOligos; if (reorient) { delete  rtrimOligos; } } return count; }\r
+               if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); delete oligosObject; if (hasOligos)   { delete trimOligos; if (reorient) { delete  rtrimOligos; } } return count; }\r
        \r
                //print common header\r
                if (sfftxt) {   printCommonHeader(outSfftxt, header);           }\r
@@ -613,7 +626,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){
                                                \r
                        //read data\r
                        seqRead read;  Header readheader;\r
-            readSeqData(in, read, header.numFlowsPerRead, readheader);\r
+            readSeqData(in, read, header.numFlowsPerRead, readheader, trimOligos, rtrimOligos);\r
             \r
             bool okay = sanityCheck(readheader, read);\r
             if (!okay) { break; }\r
@@ -700,6 +713,9 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){
             else { outputNames.push_back(noMatchFile); outputTypes["sff"].push_back(noMatchFile); }\r
         }\r
         \r
+        delete oligosObject;\r
+        if (hasOligos)   { delete trimOligos; if (reorient) { delete  rtrimOligos; } }\r
+        \r
                return count;\r
        }\r
        catch(exception& e) {\r
@@ -1036,7 +1052,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
        }\r
 }\r
 //**********************************************************************************************************************\r
-bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header){\r
+bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header, TrimOligos*& trimOligos, TrimOligos*& rtrimOligos){\r
        try {\r
         unsigned long long startSpotInFile = in.tellg();\r
                if (!in.eof()) {\r
@@ -1144,7 +1160,7 @@ bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads,
                \r
                 int barcodeIndex, primerIndex, trashCodeLength;\r
                 \r
-                if (hasOligos)      {  trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex);                }\r
+                if (hasOligos)      {  trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex, trimOligos, rtrimOligos);                }\r
                 else if (hasGroup)  {  trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex, "groupMode");   }\r
                 else {  m->mothurOut("[ERROR]: uh oh, we shouldn't be here...\n"); }\r
 \r
@@ -1189,10 +1205,8 @@ bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads,
        }\r
 }\r
 //**********************************************************************************************************************\r
-int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) {\r
+int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer, TrimOligos*& trimOligos, TrimOligos*& rtrimOligos) {\r
        try {\r
-        //find group read belongs to\r
-        TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);\r
         \r
         int success = 1;\r
         string trashCode = "";\r
@@ -1229,55 +1243,81 @@ int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& pr
         Sequence currSeq(header.name, seq);\r
         QualityScores currQual;\r
         \r
+        //for reorient\r
+        Sequence savedSeq(currSeq.getName(), currSeq.getAligned());\r
+        QualityScores savedQual(currQual.getName(), currQual.getScores());\r
+        \r
         if(numLinkers != 0){\r
-            success = trimOligos.stripLinker(currSeq, currQual);\r
+            success = trimOligos->stripLinker(currSeq, currQual);\r
             if(success > ldiffs)               {       trashCode += 'k';       }\r
             else{ currentSeqsDiffs += success;  }\r
             \r
         }\r
         \r
-        if(barcodes.size() != 0){\r
-            success = trimOligos.stripBarcode(currSeq, currQual, barcode);\r
+        if(numBarcodes != 0){\r
+            success = trimOligos->stripBarcode(currSeq, currQual, barcode);\r
             if(success > bdiffs)               {       trashCode += 'b';       }\r
             else{ currentSeqsDiffs += success;  }\r
         }\r
         \r
         if(numSpacers != 0){\r
-            success = trimOligos.stripSpacer(currSeq, currQual);\r
+            success = trimOligos->stripSpacer(currSeq, currQual);\r
             if(success > sdiffs)               {       trashCode += 's';       }\r
             else{ currentSeqsDiffs += success;  }\r
             \r
         }\r
         \r
         if(numFPrimers != 0){\r
-            success = trimOligos.stripForward(currSeq, currQual, primer, true);\r
+            success = trimOligos->stripForward(currSeq, currQual, primer, true);\r
             if(success > pdiffs)               {       trashCode += 'f';       }\r
             else{ currentSeqsDiffs += success;  }\r
         }\r
         \r
         if (currentSeqsDiffs > tdiffs) {       trashCode += 't';   }\r
         \r
-        if(revPrimer.size() != 0){\r
-            success = trimOligos.stripReverse(currSeq, currQual);\r
+        if(numRPrimers != 0){\r
+            success = trimOligos->stripReverse(currSeq, currQual);\r
             if(!success)                               {       trashCode += 'r';       }\r
         }\r
 \r
-        if (trashCode.length() == 0) { //is this sequence in the ignore group\r
-            string thisGroup = "";\r
+        if (reorient && (trashCode != "")) { //if you failed and want to check the reverse\r
+            int thisSuccess = 0;\r
+            string thisTrashCode = "";\r
+            int thisCurrentSeqsDiffs = 0;\r
             \r
-            if(barcodes.size() != 0){\r
-                thisGroup = barcodeNameVector[barcode];\r
-                if (numFPrimers != 0) {\r
-                    if (primerNameVector[primer] != "") {\r
-                        if(thisGroup != "") {\r
-                            thisGroup += "." + primerNameVector[primer];\r
-                        }else {\r
-                            thisGroup = primerNameVector[primer];\r
-                        }\r
-                    }\r
-                }\r
+            int thisBarcodeIndex = 0;\r
+            int thisPrimerIndex = 0;\r
+            //cout << currSeq.getName() << '\t' << savedSeq.getUnaligned() << endl;\r
+            if(numBarcodes != 0){\r
+                thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex);\r
+                if(thisSuccess > bdiffs)               { thisTrashCode += "b"; }\r
+                else{ thisCurrentSeqsDiffs += thisSuccess;  }\r
+            }\r
+            //cout << currSeq.getName() << '\t' << savedSeq.getUnaligned() << endl;\r
+            if(numFPrimers != 0){\r
+                thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, true);\r
+                if(thisSuccess > pdiffs)               { thisTrashCode += "f"; }\r
+                else{ thisCurrentSeqsDiffs += thisSuccess;  }\r
             }\r
             \r
+            if (thisCurrentSeqsDiffs > tdiffs) {       thisTrashCode += 't';   }\r
+            \r
+            if (thisTrashCode == "") {\r
+                trashCode = thisTrashCode;\r
+                success = thisSuccess;\r
+                currentSeqsDiffs = thisCurrentSeqsDiffs;\r
+                barcode = thisBarcodeIndex;\r
+                primer = thisPrimerIndex;\r
+                savedSeq.reverseComplement();\r
+                currSeq.setAligned(savedSeq.getAligned());\r
+                savedQual.flipQScores();\r
+                currQual.setScores(savedQual.getScores());\r
+            }else { trashCode += "(" + thisTrashCode + ")";  }\r
+        }\r
+\r
+        if (trashCode.length() == 0) { //is this sequence in the ignore group\r
+            string thisGroup = oligosObject->getGroupName(barcode, primer);\r
+            \r
             int pos = thisGroup.find("ignore");\r
             if (pos != string::npos) {  trashCode += "i"; }\r
         }\r
@@ -1297,12 +1337,6 @@ int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& pr
         \r
         string group = groupMap->getGroup(header.name);\r
         if (group == "not found") {     trashCode += "g";   } //scrap for group\r
-        else { //find file group\r
-            map<string, int>::iterator it = barcodes.find(group);\r
-            if (it != barcodes.end()) {\r
-                barcode = it->second;\r
-            }else { trashCode += "g"; }\r
-        }\r
         \r
         return trashCode.length();\r
     }\r
@@ -1833,121 +1867,59 @@ bool SffInfoCommand::readOligos(string oligoFile){
         numSplitReads.clear();\r
         filehandlesHeaders.clear();\r
         \r
-               ifstream inOligos;\r
-               m->openInputFile(oligoFile, inOligos);\r
-               \r
-               string type, oligo, group;\r
+               bool allBlank = false;\r
+        oligosObject->read(oligoFile);\r
         \r
-               int indexPrimer = 0;\r
-               int indexBarcode = 0;\r
-               \r
-               while(!inOligos.eof()){\r
-            \r
-                       inOligos >> type;\r
-            \r
-                       if(type[0] == '#'){\r
-                               while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there\r
-                               m->gobble(inOligos);\r
-                       }\r
-                       else{\r
-                               m->gobble(inOligos);\r
-                               //make type case insensitive\r
-                               for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }\r
-                               \r
-                               inOligos >> oligo;\r
-                               \r
-                               for(int i=0;i<oligo.length();i++){\r
-                                       oligo[i] = toupper(oligo[i]);\r
-                                       if(oligo[i] == 'U')     {       oligo[i] = 'T'; }\r
-                               }\r
-                               \r
-                               if(type == "FORWARD"){\r
-                                       group = "";\r
-                                       \r
-                                       // get rest of line in case there is a primer name\r
-                                       while (!inOligos.eof()) {\r
-                                               char c = inOligos.get();\r
-                                               if (c == 10 || c == 13 || c == -1){     break;  }\r
-                                               else if (c == 32 || c == 9){;} //space or tab\r
-                                               else {  group += c;  }\r
-                                       }\r
-                                       \r
-                                       //check for repeat barcodes\r
-                                       map<string, int>::iterator itPrime = primers.find(oligo);\r
-                                       if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }\r
-                                       \r
-                                       primers[oligo]=indexPrimer; indexPrimer++;\r
-                                       primerNameVector.push_back(group);\r
-                   \r
-                               }else if(type == "REVERSE"){\r
-                                       //Sequence oligoRC("reverse", oligo);\r
-                                       //oligoRC.reverseComplement();\r
-                    string oligoRC = reverseOligo(oligo);\r
-                                       revPrimer.push_back(oligoRC);\r
-                               }\r
-                               else if(type == "BARCODE"){\r
-                                       inOligos >> group;\r
-                    \r
-                                       \r
-                                       //check for repeat barcodes\r
-                                       map<string, int>::iterator itBar = barcodes.find(oligo);\r
-                                       if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }\r
-                    \r
-                                       barcodes[oligo]=indexBarcode; indexBarcode++;\r
-                                       barcodeNameVector.push_back(group);\r
-                               }else if(type == "LINKER"){\r
-                                       linker.push_back(oligo);\r
-                               }else if(type == "SPACER"){\r
-                                       spacer.push_back(oligo);\r
-                               }\r
-                               else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }\r
-                       }\r
-                       m->gobble(inOligos);\r
-               }\r
-               inOligos.close();\r
-    \r
-               if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1;      }\r
-    \r
-               //add in potential combos\r
-               if(barcodeNameVector.size() == 0){\r
-                       barcodes[""] = 0;\r
-                       barcodeNameVector.push_back("");\r
-               }\r
-               \r
-               if(primerNameVector.size() == 0){\r
-                       primers[""] = 0;\r
-                       primerNameVector.push_back("");\r
-               }\r
-               \r
-               filehandles.resize(barcodeNameVector.size());\r
+        if (m->control_pressed) { return false; } //error in reading oligos\r
+        \r
+        if (oligosObject->hasPairedBarcodes()) {\r
+            pairedOligos = true;\r
+            m->mothurOut("[ERROR]: sffinfo does not support paired barcodes and primers, aborting.\n"); m->control_pressed = true; return true;\r
+        }else {\r
+            pairedOligos = false;\r
+            numFPrimers = oligosObject->getPrimers().size();\r
+            numBarcodes = oligosObject->getBarcodes().size();\r
+        }\r
+        \r
+        numLinkers = oligosObject->getLinkers().size();\r
+        numSpacers = oligosObject->getSpacers().size();\r
+        numRPrimers = oligosObject->getReversePrimers().size();\r
+        \r
+        vector<string> groupNames = oligosObject->getGroupNames();\r
+        if (groupNames.size() == 0) { allBlank = true;  }\r
+        \r
+        filehandles.resize(oligosObject->getBarcodeNames().size());\r
                for(int i=0;i<filehandles.size();i++){\r
-                       filehandles[i].assign(primerNameVector.size(), "");\r
+            for(int j=0;j<oligosObject->getPrimerNames().size();j++){  filehandles[i].push_back(""); }\r
                }\r
         \r
-               if(split > 1){\r
-                       set<string> uniqueNames; //used to cleanup outputFileNames\r
-                       for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){\r
-                               for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){\r
-                                       \r
-                                       string primerName = primerNameVector[itPrimer->second];\r
-                                       string barcodeName = barcodeNameVector[itBar->second];\r
-                                       \r
+        if(split > 1){\r
+            set<string> uniqueNames; //used to cleanup outputFileNames\r
+            map<string, int> barcodes = oligosObject->getBarcodes() ;\r
+            map<string, int> primers = oligosObject->getPrimers();\r
+            for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){\r
+                for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){\r
+                    \r
+                    string primerName = oligosObject->getPrimerName(itPrimer->second);\r
+                    string barcodeName = oligosObject->getBarcodeName(itBar->second);\r
+                    \r
                     if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing\r
+                    else if ((primerName == "") && (barcodeName == "")) { } //do nothing\r
                     else {\r
                         string comboGroupName = "";\r
                         string fastaFileName = "";\r
                         string qualFileName = "";\r
                         string nameFileName = "";\r
+                        string countFileName = "";\r
                         \r
                         if(primerName == ""){\r
-                            comboGroupName = barcodeNameVector[itBar->second];\r
-                        }\r
-                        else{\r
+                            comboGroupName = barcodeName;\r
+                        }else{\r
                             if(barcodeName == ""){\r
-                                comboGroupName = primerNameVector[itPrimer->second];\r
+                                comboGroupName = primerName;\r
                             }\r
                             else{\r
-                                comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];\r
+                                comboGroupName = barcodeName + "." + primerName;\r
                             }\r
                         }\r
                         \r
@@ -1965,33 +1937,16 @@ bool SffInfoCommand::readOligos(string oligoFile){
                         filehandles[itBar->second][itPrimer->second] = thisFilename;\r
                         temp.open(thisFilename.c_str(), ios::binary);          temp.close();\r
                     }\r
-                               }\r
-                       }\r
-               }\r
-               numFPrimers = primers.size();\r
-        numLinkers = linker.size();\r
-        numSpacers = spacer.size();\r
+                }\r
+            }\r
+        }\r
+        \r
         map<string, string> variables;\r
         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName));\r
         variables["[group]"] = "scrap";\r
                noMatchFile = getOutputFileName("sff",variables);\r
         m->mothurRemove(noMatchFile);\r
         numNoMatch = 0;\r
-        \r
-        \r
-               bool allBlank = true;\r
-               for (int i = 0; i < barcodeNameVector.size(); i++) {\r
-                       if (barcodeNameVector[i] != "") {\r
-                               allBlank = false;\r
-                               break;\r
-                       }\r
-               }\r
-               for (int i = 0; i < primerNameVector.size(); i++) {\r
-                       if (primerNameVector[i] != "") {\r
-                               allBlank = false;\r
-                               break;\r
-                       }\r
-               }\r
                \r
         filehandlesHeaders.resize(filehandles.size());\r
         numSplitReads.resize(filehandles.size());\r
@@ -2023,7 +1978,6 @@ bool SffInfoCommand::readGroup(string oligoFile){
         filehandles.clear();\r
         numSplitReads.clear();\r
         filehandlesHeaders.clear();\r
-        barcodes.clear();\r
         \r
         groupMap = new GroupMap();\r
         groupMap->readMap(oligoFile);\r
@@ -2045,7 +1999,6 @@ bool SffInfoCommand::readGroup(string oligoFile){
                 ofstream temp;\r
                 m->openOutputFileBinary(thisFilename, temp); temp.close();\r
                 filehandles[i].push_back(thisFilename);\r
-                barcodes[groups[i]] = i;\r
             }\r
         }\r
         \r