git.donarmstrong.com Git - mothur.git/blobdiff - sequenceparser.cpp
fixed bug with dist.shared subsampling. added mode parameter to dist.shared so...
[mothur.git] / sequenceparser.cpp
index 44012d8ba8321fea520fbe613e0dac8f8911c760..3eb508dd737e15d4f06cb04c202608b5d5e1fe7c 100644 (file)
@@ -7,7 +7,7 @@
  *
  */
 
-#include "sequenceParser.h"
+#include "sequenceparser.h"
 
 
 /************************************************************/
@@ -37,13 +37,16 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                m->openInputFile(fastaFile, in);
                
                map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
+        int fastaCount = 0;
                while (!in.eof()) {
                        
                        if (m->control_pressed) { break; }
                        
                        Sequence seq(in); m->gobble(in);
+            fastaCount++;
+            if (m->debug) { if((fastaCount) % 1000 == 0){      m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
                        
-                       if (seq.getName() != "") {
+        if (seq.getName() != "") {
                                
                                 string group = groupMap->getGroup(seq.getName());
                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
@@ -63,6 +66,8 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                
                string first, second;
                int countName = 0;
+               set<string> thisnames1;
+               
                while(!inName.eof()) {
                        
                        if (m->control_pressed) { break; }
@@ -94,10 +99,12 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                                        it = splitMap.find(group);
                                        if (it != splitMap.end()) { //adding seqs to this group
                                                (it->second) += "," + names[i];
+                                               thisnames1.insert(names[i]);
                                                countName++;
                                        }else { //first sighting of this group
                                                splitMap[group] = names[i];
                                                countName++;
+                                               thisnames1.insert(names[i]);
                                                
                                                //is this seq in the fasta file?
                                                if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
@@ -106,6 +113,8 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                                                }
                                        }
                                }
+                               
+                               allSeqsMap[names[i]] = names[0];
                        }
                        
                        
@@ -127,8 +136,17 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                inName.close();
                
                if (error == 1) { m->control_pressed = true; }
-               
+                       
                if (countName != (groupMap->getNumSeqs())) {
+                       vector<string> groupseqsnames = groupMap->getNamesSeqs();
+                       
+                       for (int i = 0; i < groupseqsnames.size(); i++) {
+                               set<string>::iterator itnamesfile = thisnames1.find(groupseqsnames[i]);
+                               if (itnamesfile == thisnames1.end()){
+                                       cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl;
+                               }
+                       }
+                       
                        m->mothurOutEndLine();
                        m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
                        m->mothurOutEndLine();
@@ -238,6 +256,7 @@ vector<Sequence> SequenceParser::getSeqs(string g){
                        m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
                }else {
                        seqForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
                }
                
                return seqForThisGroup; 
@@ -247,6 +266,79 @@ vector<Sequence> SequenceParser::getSeqs(string g){
                exit(1);
        }
 }
+/************************************************************/
+int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ // Writes the sequences stored for group g to filename (uchime-style abundance labels if uchimeFormat, otherwise via Sequence::printSequence); returns 1 if aborted after the file was opened, otherwise 0.
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               vector<seqPriorityNode> nameVector;
+               // seqs holds one vector<Sequence> per group name
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       // group exists: open the output file before anything is written
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+                       // work on a copy of the group's sequence vector
+                       seqForThisGroup = it->second;
+                       
+                       if (uchimeFormat) {
+                               // each record is written as
+                               //>seqName/ab=numRedundantSeqs/
+                               //sequence
+                               // the abundance is looked up in this group's name map, keyed by sequence name
+                               map<string, string> nameMapForThisGroup = getNameMap(g);
+                               map<string, string>::iterator itNameMap;
+                               int error = 0;
+                               // build one seqPriorityNode per sequence that has a name-map entry; keep scanning after a miss so every missing name is reported
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName());
+                                       
+                                       if (itNameMap == nameMapForThisGroup.end()){
+                                               error = 1;
+                                               m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine();
+                                       }else {
+                                               int num = m->getNumNames(itNameMap->second);
+                                               // NOTE(review): num is presumably the count of names listed in the name-map value -- confirm against getNumNames
+                                               seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
+                                               nameVector.push_back(temp);
+                                       }
+                               }
+                               // any missing name aborts: close and remove the (still empty) output file
+                               if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               //sort by num represented (the exact ordering is defined by compareSeqPriorityNodes, which is not shown here)
+                               sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+
+                               //print new file in the sorted order
+                               for (int i = 0; i < nameVector.size(); i++) {
+                                       // m->control_pressed set (presumably a user interrupt): discard the partial file and report 1
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
+                               }
+                               
+                       }else { 
+                //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       // same abort handling as the uchime branch: close, remove the partial file, return 1
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       seqForThisGroup[i].printSequence(out);  
+                               }
+                       }
+                       out.close();
+               }
+               // NOTE: an unknown group only logs the error above and still returns 0
+               return 0; 
+       }
+       catch(exception& e) { // report through m->errorOut, then terminate the process
+               m->errorOut(e, "SequenceParser", "getSeqs");
+               exit(1);
+       }
+}
+
 /************************************************************/
 map<string, string> SequenceParser::getNameMap(string g){ 
        try {
@@ -258,6 +350,7 @@ map<string, string> SequenceParser::getNameMap(string g){
                        m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
                }else {
                        nameMapForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences.");  }
                }
                
                return nameMapForThisGroup; 
@@ -268,6 +361,38 @@ map<string, string> SequenceParser::getNameMap(string g){
        }
 }
 /************************************************************/
+int SequenceParser::getNameMap(string g, string filename){ // Writes the name map stored for group g to filename, one "key<TAB>value" line per entry; returns 1 if interrupted while writing, otherwise 0.
+       try {
+               map<string, map<string, string> >::iterator it;
+               map<string, string> nameMapForThisGroup;
+               // nameMapPerGroup holds one map<string, string> per group name
+               it = nameMapPerGroup.find(g);
+               if(it == nameMapPerGroup.end()) {
+                       m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       nameMapForThisGroup = it->second;
+                       // the output file is only opened once the group is known to exist
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+                       // entries are written in the std::map's key order
+                       for (map<string, string>::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) {
+                               // m->control_pressed set (presumably a user interrupt): close and remove the partial file, report 1
+                               if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               out << itFile->first << '\t' << itFile->second << endl;
+                       }
+                       
+                       out.close();
+               }
+               // NOTE: an unknown group only logs the error above and still returns 0
+               return 0; 
+       }
+       catch(exception& e) { // report through m->errorOut, then terminate the process
+               m->errorOut(e, "SequenceParser", "getNameMap");
+               exit(1);
+       }
+}
+/************************************************************/