]> git.donarmstrong.com Git - mothur.git/blobdiff - sequencecountparser.cpp
added SequenceCountParser class to parse the count table by group. added count parame...
[mothur.git] / sequencecountparser.cpp
diff --git a/sequencecountparser.cpp b/sequencecountparser.cpp
new file mode 100644 (file)
index 0000000..4ba6091
--- /dev/null
@@ -0,0 +1,289 @@
+//
+//  sequencecountparser.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/7/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sequencecountparser.h"
+
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
+       try {
+               
+               m = MothurOut::getInstance();
+               
+               //read count file
+               CountTable countTable;
+               countTable.readTable(countfile);
+               
+               //initialize maps
+               namesOfGroups = countTable.getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {
+                       vector<Sequence> temp;
+                       map<string, int> tempMap;
+                       seqs[namesOfGroups[i]] = temp;
+                       countTablePerGroup[namesOfGroups[i]] = tempMap;
+               }
+               
+               //read fasta file making sure each sequence is in the group file
+               ifstream in;
+               m->openInputFile(fastafile, in);
+               
+        int fastaCount = 0;
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence seq(in); m->gobble(in);
+            fastaCount++;
+            if (m->debug) { if((fastaCount) % 1000 == 0){      m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                       
+            if (seq.getName() != "") {
+                               
+                allSeqsMap[seq.getName()] = seq.getName();
+                vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                
+                for (int i = 0; i < namesOfGroups.size(); i++) {
+                    if (groupCounts[i] != 0) {
+                        seqs[namesOfGroups[i]].push_back(seq); 
+                        countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                    }
+                }
+                       }
+               }
+               in.close();                                     
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
+       try {
+               
+               m = MothurOut::getInstance();
+                               
+               //initialize maps
+        if (countTable.hasGroupInfo()) {
+            namesOfGroups = countTable.getNamesOfGroups();
+            for (int i = 0; i < namesOfGroups.size(); i++) {
+                vector<Sequence> temp;
+                map<string, int> tempMap;
+                seqs[namesOfGroups[i]] = temp;
+                countTablePerGroup[namesOfGroups[i]] = tempMap;
+            }
+            
+            //read fasta file making sure each sequence is in the group file
+            ifstream in;
+            m->openInputFile(fastafile, in);
+            
+            int fastaCount = 0;
+            while (!in.eof()) {
+                
+                if (m->control_pressed) { break; }
+                
+                Sequence seq(in); m->gobble(in);
+                fastaCount++;
+                if (m->debug) { if((fastaCount) % 1000 == 0){  m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                
+                if (seq.getName() != "") {
+                    
+                    allSeqsMap[seq.getName()] = seq.getName();
+                    vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                    
+                    for (int i = 0; i < namesOfGroups.size(); i++) {
+                        if (groupCounts[i] != 0) {
+                            seqs[namesOfGroups[i]].push_back(seq);     
+                            countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                        }
+                    }
+                }
+            }
+            in.close();        
+        }else {  m->control_pressed = true;  m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::~SequenceCountParser(){  }
+/************************************************************/
+int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); }
+/************************************************************/
+vector<string> SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; }
+/************************************************************/
+int SequenceCountParser::getNumSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               int num = 0;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+               }else {
+                       num = (it->second).size();
+               }
+               
+               return num; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getNumSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<Sequence> SequenceCountParser::getSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       seqForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
+               }
+               
+               return seqForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               vector<seqPriorityNode> nameVector;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+                       
+                       seqForThisGroup = it->second;
+                       
+                       if (uchimeFormat) {
+                               // format should look like 
+                               //>seqName /ab=numRedundantSeqs/
+                               //sequence
+                               
+                               map<string, int> countForThisGroup = getCountTable(g);
+                               map<string, int>::iterator itCount;
+                               int error = 0;
+                               
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       itCount = countForThisGroup.find(seqForThisGroup[i].getName());
+                                       
+                                       if (itCount == countForThisGroup.end()){
+                                               error = 1;
+                                               m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
+                                       }else {
+                        seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
+                                               nameVector.push_back(temp);
+                                       }
+                               }
+                               
+                               if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               //sort by num represented
+                               sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+                
+                               //print new file in order of
+                               for (int i = 0; i < nameVector.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
+                               }
+                               
+                       }else { 
+                //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       seqForThisGroup[i].printSequence(out);  
+                               }
+                       }
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+
+/************************************************************/
+map<string, int> SequenceCountParser::getCountTable(string g){ 
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences.");  }
+               }
+               
+               return countForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getCountTable(string g, string filename){ 
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+            out << "Representative_Sequence\ttotal\t" << g << endl;
+            
+                       for (map<string, int>::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) {
+                               
+                               if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl;
+                       }
+                       
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+
+
+