added SequenceCountParser class to parse the count table by group. added count parame...

author Sarah Westcott <mothur.westcott@gmail.com>

Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)

committer Sarah Westcott <mothur.westcott@gmail.com>

Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)
author Sarah Westcott <mothur.westcott@gmail.com>
Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)
committer Sarah Westcott <mothur.westcott@gmail.com>
Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index ff0e35bbb0bac1d955c7629e75eeb872e2cc1d65..104c6d5c12c6e4185823e2d5e833760134f8c16a 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -24,6 +24,7 @@
                 A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; };
                 A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
                 A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
+               A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; };
                 A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; };
                 A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
                 A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; };
@@ -397,6 +398,8 @@
                 A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = "<group>"; };
                 A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = "<group>"; };
                 A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = "<group>"; };
+               A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = "<group>"; };
+               A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = "<group>"; };
                 A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = "<group>"; };
                 A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = "<group>"; };
                 A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = "<group>"; };
@@ -1659,6 +1662,8 @@
                                 A7E9B7D012D37EC400DA6239 /* sabundvector.hpp */,
                                 A7E9B7DB12D37EC400DA6239 /* sequence.cpp */,
                                 A7E9B7DC12D37EC400DA6239 /* sequence.hpp */,
+                               A741FAD415D168A00067BCC5 /* sequencecountparser.h */,
+                               A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */,
                                 A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */,
                                 A7E9B7DE12D37EC400DA6239 /* sequencedb.h */,
                                 A7F9F5CD141A5E500032F693 /* sequenceparser.h */,
@@ -2192,6 +2197,7 @@
                                 A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */,
                                 A74D59A4159A1E2000043046 /* counttable.cpp in Sources */,
                                 A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */,
+                               A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp

index e3691e8b942c2f34da91cec14691e717d5c16c1a..0955f2dc662dc903c49b34d1fe70103ef8a2b5b0 100644 (file)
--- a/chimeraperseuscommand.cpp
+++ b/chimeraperseuscommand.cpp
@@ -10,12 +10,15 @@
  #include "chimeraperseuscommand.h"
  #include "deconvolutecommand.h"
  #include "sequence.hpp"
+#include "counttable.h"
+#include "sequencecountparser.h"
  //**********************************************************************************************************************
  vector<string> ChimeraPerseusCommand::setParameters(){ 
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -36,10 +39,11 @@ vector<string> ChimeraPerseusCommand::setParameters(){
  string ChimeraPerseusCommand::getHelpString(){ 
         try {
                 string helpString = "";
-               helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
+               helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
                 helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
-               helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
+               helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
+        helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
@@ -96,6 +100,8 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(){
  ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
         try {
                 abort = false; calledHelp = false; 
+        hasCount = false;
+        hasName = false;
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
@@ -107,7 +113,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         OptionParser parser(option);
                         map<string,string> parameters = parser.getParameters();
                         
-                       ValidParameters validParameter("chimera.uchime");
+                       ValidParameters validParameter("chimera.perseus");
                         map<string,string>::iterator it;
                         
                         //check to make sure all parameters are valid for command
@@ -203,15 +209,9 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         
                         
                         //check for required parameters
-                       bool hasName = true;
                         namefile = validParameter.validFile(parameters, "name", false);
-                       if (namefile == "not found") { 
-                               //if there is a current fasta file, use it
-                               string filename = m->getNameFile(); 
-                               if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
-                               else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }                                
-                               hasName = false;
-                       }else { 
+                       if (namefile == "not found") { namefile = "";   }
+                       else { 
                                 m->splitAtDash(namefile, nameFileNames);
                                 
                                 //go through files and make sure they are good, if not, then disregard them
@@ -277,12 +277,101 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                                                 }
                                         }
                                 }
+                       }
+            
+            if (nameFileNames.size() != 0) { hasName = true; }
+            
+            //check for required parameters
+            vector<string> countfileNames;
+                       countfile = validParameter.validFile(parameters, "count", false);
+                       if (countfile == "not found") { 
+                countfile = "";  
+                       }else { 
+                               m->splitAtDash(countfile, countfileNames);
                                 
-                               //make sure there is at least one valid file left
-                               if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < countfileNames.size(); i++) {
+                                       
+                                       bool ignore = false;
+                                       if (countfileNames[i] == "current") { 
+                                               countfileNames[i] = m->getCountTableFile(); 
+                                               if (nameFileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
+                                               else {  
+                                                       m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                                               
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(countfileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       countfileNames[i] = inputDir + countfileNames[i];               }
+                                               }
+                                               
+                                               int ableToOpen;
+                                               ifstream in;
+                                               
+                                               ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+                                               
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //default path is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }else {
+                                                       m->setCountTableFile(countfileNames[i]);
+                                               }
+                                       }
+                               }
                         }
-                       
-                       if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+            
+            if (countfileNames.size() != 0) { hasCount = true; }
+            
+                       //make sure there is at least one valid file left
+            if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
+            if (!hasName && !hasCount) { 
+                //if there is a current name file, use it, else look for current count file
+                               string filename = m->getNameFile(); 
+                               if (filename != "") { hasName = true; nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
+                               else { 
+                    filename = m->getCountTableFile();
+                    if (filename != "") { hasCount = true; countfileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                    else { m->mothurOut("[ERROR]: You must provide a count or name file."); m->mothurOutEndLine(); abort = true;  }
+                }
+            }
+            if (!hasName && hasCount) { nameFileNames = countfileNames; }
+            
+                       if (nameFileNames.size() != fastaFileNames.size()) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
                         bool hasGroup = true;
                         groupfile = validParameter.validFile(parameters, "group", false);
@@ -360,6 +449,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         
                         if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
+            if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -415,41 +505,82 @@ int ChimeraPerseusCommand::execute(){
                         
                         int numSeqs = 0;
                         int numChimeras = 0;
-                       
-                       if (groupFile != "") {
-                               //Parse sequences by group
-                               SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
-                               vector<string> groups = parser.getNamesOfGroups();
-                               
-                               if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
-                               
-                               //clears files
-                               ofstream out, out1, out2;
-                               m->openOutputFile(outputFileName, out); out.close(); 
-                               m->openOutputFile(accnosFileName, out1); out1.close();
-                               
-                               if(processors == 1)     {       numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups);       }
-                               else                            {       numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
-                               
-                               if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
-                               
-                               numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
-                               
-                               m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
-                               
-                               if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
-                               
-                       }else{
-                               if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
-                               
-                               //read sequences and store sorted by frequency
-                               vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
-                               
-                               if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
-                               
-                               numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); 
+            
+            if (hasCount) {
+                CountTable* ct = new CountTable();
+                ct->readTable(nameFile);
+                
+                if (ct->hasGroupInfo()) {
+                    cparser = new SequenceCountParser(fastaFileNames[s], *ct);
+                    
+                    vector<string> groups = cparser->getNamesOfGroups();
+                    
+                    if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
+                    
+                    //clears files
+                    ofstream out, out1, out2;
+                    m->openOutputFile(outputFileName, out); out.close(); 
+                    m->openOutputFile(accnosFileName, out1); out1.close();
+                    
+                    if(processors == 1)        {       numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups);       }
+                    else                               {       numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
+                    
+                    if (m->control_pressed) {  delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
+                    map<string, string> uniqueNames = cparser->getAllSeqsMap();
+                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    delete cparser;
+
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                    
+                    if (m->control_pressed) {  delete ct; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;  } 
+                    
+                }else {
+                    if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+                    
+                    //read sequences and store sorted by frequency
+                    vector<seqData> sequences = readFiles(fastaFileNames[s], ct);
+                    
+                    if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
+                    
+                    numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);   
+                }
+                delete ct;
+            }else {
+                if (groupFile != "") {
+                    //Parse sequences by group
+                    parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile);
+                    vector<string> groups = parser->getNamesOfGroups();
+                    
+                    if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) {    m->mothurRemove(outputNames[j]);        }  return 0; }
+                    
+                    //clears files
+                    ofstream out, out1, out2;
+                    m->openOutputFile(outputFileName, out); out.close(); 
+                    m->openOutputFile(accnosFileName, out1); out1.close();
+                    
+                    if(processors == 1)        {       numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups);       }
+                    else                               {       numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
+                    
+                    if (m->control_pressed) {  delete parser; for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        }  return 0;    }                               
+                    map<string, string> uniqueNames = parser->getAllSeqsMap();
+                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    delete parser;
+                    
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                    
+                    if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {  m->mothurRemove(outputNames[j]);        }  return 0;  }         
+                }else{
+                    if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+                    
+                    //read sequences and store sorted by frequency
+                    vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
+                    
+                    if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        } return 0; }
+                    
+                    numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); 
+                }
                         }
-                       
+            
                         if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
                         
                         m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found.");      m->mothurOutEndLine();
@@ -510,7 +641,7 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){
         }
  }
  //**********************************************************************************************************************
-int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
+int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector<string> groups){
         try {
                 
                 int totalSeqs = 0;
@@ -522,7 +653,7 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa
                         
                         int start = time(NULL);  if (m->control_pressed) {  return 0; }
                         
-                       vector<seqData> sequences = loadSequences(parser, groups[i]);
+                       vector<seqData> sequences = loadSequences(groups[i]);
                         
                         if (m->control_pressed) { return 0; }
                         
@@ -547,32 +678,48 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa
         }
  }      
  //**********************************************************************************************************************
-vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
+vector<seqData> ChimeraPerseusCommand::loadSequences(string group){
         try {
-               
-               vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
-               map<string, string> nameMap = parser.getNameMap(group);
-               map<string, string>::iterator it;
-               
-               vector<seqData> sequences;
-               bool error = false;
-        alignLength = 0;
-               
-               for (int i = 0; i < thisGroupsSeqs.size(); i++) {
-               
-                       if (m->control_pressed) {  return sequences; }
-                       
-                       it = nameMap.find(thisGroupsSeqs[i].getName());
-                       if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
-                       else {
-                               int num = m->getNumNames(it->second);
-                               sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
-                if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
-                       }
+        bool error = false;
+               alignLength = 0;
+        vector<seqData> sequences;
+        if (hasCount) {
+            vector<Sequence> thisGroupsSeqs = cparser->getSeqs(group);
+            map<string, int> counts = cparser->getCountTable(group);
+            map<string, int>::iterator it;
+            
+            for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                
+                if (m->control_pressed) {  return sequences; }
+                
+                it = counts.find(thisGroupsSeqs[i].getName());
+                if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); }
+                else {
+                    sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+                    if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                }
+            }
+        }else{
+            vector<Sequence> thisGroupsSeqs = parser->getSeqs(group);
+            map<string, string> nameMap = parser->getNameMap(group);
+            map<string, string>::iterator it;
+           
+            for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                
+                if (m->control_pressed) {  return sequences; }
+                
+                it = nameMap.find(thisGroupsSeqs[i].getName());
+                if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
+                else {
+                    int num = m->getNumNames(it->second);
+                    sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+                    if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                }
+            }
+            
                 }
                 
-               if (error) { m->control_pressed = true; }
-               
+        if (error) { m->control_pressed = true; }
                 //sort by frequency
                 sort(sequences.rbegin(), sequences.rend());
                 
@@ -619,6 +766,37 @@ vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
                 
                 return sequences;
         }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){
+       try {           
+               //read fasta file and create sequenceData structure - checking for file mismatches
+               vector<seqData> sequences;
+               ifstream in;
+               m->openInputFile(inputFile, in);
+               alignLength = 0;
+        
+               while (!in.eof()) {
+            Sequence temp(in); m->gobble(in);
+                       
+                       int count = ct->getNumSeqs(temp.getName());
+                       if (m->control_pressed) { break; }
+                       else {
+                               sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count));
+                if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
+                       }
+               }
+               in.close();
+               
+               //sort by frequency
+               sort(sequences.rbegin(), sequences.rend());
+               
+               return sequences;
+       }
         catch(exception& e) {
                 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
                 exit(1);
@@ -771,7 +949,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
         }
  }
  /**************************************************************************************************/
-int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
+int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
         try {
                 
                 vector<int> processIDS;
@@ -801,7 +979,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                 process++;
                         }else if (pid == 0){
-                               num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
+                               num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
                                 
                                 //pass numSeqs to parent
                                 ofstream out;
@@ -819,7 +997,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                 }
                 
                 //do my part
-               num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -850,7 +1028,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                         // Allocate memory for thread data.
                         string extension = toString(i) + ".temp";
                         
-                       perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
+                       perseusData* tempPerseus = new perseusData(hasName, hasCount, alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
                         
                         pDataArray.push_back(tempPerseus);
                         processIDS.push_back(i);
@@ -862,7 +1040,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                 
                 
                 //using the main process as a worker saves time and memory
-               num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
                 
                 //Wait until all threads have terminated.
                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
@@ -894,9 +1072,8 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
         }
  }
  //**********************************************************************************************************************
-int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
+int ChimeraPerseusCommand::deconvoluteResults(map<string, string>& uniqueNames, string outputFileName, string accnosFileName){
         try {
-               map<string, string> uniqueNames = parser.getAllSeqsMap();
                 map<string, string>::iterator itUnique;
                 int total = 0;
                 
diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h

index 1c4baa3f8f0d5a8b5724af661f51850b6d0b4762..3608835cd16ed30f648469f308d2320f23bf423e 100644 (file)
--- a/chimeraperseuscommand.h
+++ b/chimeraperseuscommand.h
@@ -16,7 +16,9 @@
  #include "mothur.h"
  #include "command.hpp"
  #include "sequenceparser.h"
+#include "sequencecountparser.h"
  #include "myPerseus.h"
+#include "counttable.h"
  
  /***********************************************************/
  class ChimeraPerseusCommand : public Command {
@@ -43,10 +45,12 @@ private:
                 linePair(int i, int j) : start(i), end(j) {}
         };
         
-       bool abort;
-       string fastafile, groupfile, outputDir, namefile;
+       bool abort, hasName, hasCount;
+       string fastafile, groupfile, countfile, outputDir, namefile;
         int processors, alignLength;
         double cutoff, alpha, beta;
+    SequenceParser* parser;
+    SequenceCountParser* cparser;
         
         vector<string> outputNames;
         vector<string> fastaFileNames;
@@ -56,10 +60,11 @@ private:
         string getNamesFile(string&);
         int driver(string, vector<seqData>&, string, int&);
         vector<seqData> readFiles(string, string);
-       vector<seqData> loadSequences(SequenceParser&, string);
-       int deconvoluteResults(SequenceParser&, string, string);
-       int driverGroups(SequenceParser&, string, string, int, int, vector<string>);
-       int createProcessesGroups(SequenceParser&, string, string, vector<string>, string, string, string);
+    vector<seqData> readFiles(string inputFile, CountTable* ct);
+       vector<seqData> loadSequences(string);
+       int deconvoluteResults(map<string, string>&, string, string);
+       int driverGroups(string, string, int, int, vector<string>);
+       int createProcessesGroups(string, string, vector<string>, string, string, string);
  };
  
  /**************************************************************************************************/
@@ -75,12 +80,13 @@ struct perseusData {
         MothurOut* m;
         int start;
         int end;
+    bool hasName, hasCount;
         int threadID, count, numChimeras;
         double alpha, beta, cutoff;
         vector<string> groups;
         
         perseusData(){}
-       perseusData(double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
+       perseusData(bool hn, bool hc, double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
                 alpha = a;
                 beta = b;
                 cutoff = c;
@@ -94,6 +100,8 @@ struct perseusData {
                 end = en;
                 threadID = tid;
                 groups = gr;
+        hasName = hn;
+        hasCount = hc;
                 count = 0;
                 numChimeras = 0;
         }
@@ -114,38 +122,67 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                 
                 //parse fasta and name file by group
                 SequenceParser* parser;
-               if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
-               else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
-               
+        SequenceCountParser* cparser;
+               if (pDataArray->hasCount) {
+            CountTable* ct = new CountTable();
+            ct->readTable(pDataArray->namefile);
+            cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
+            delete ct;
+        }else {
+            if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+            else                                                       { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        }
+    
                 int totalSeqs = 0;
                 int numChimeras = 0;
                 
                 for (int i = pDataArray->start; i < pDataArray->end; i++) {
                         
-                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                         
                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "...");  pDataArray->m->mothurOutEndLine();                                      
                         
                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
                         ////////////////////////////////////////////////////////////////////////////////////////
-                       vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
-                       map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
-                       map<string, string>::iterator it;
-                       
-                       vector<seqData> sequences;
                         bool error = false;
-                       
-                       for (int j = 0; j < thisGroupsSeqs.size(); j++) {
-                               
-                               if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
-                               
-                               it = nameMap.find(thisGroupsSeqs[j].getName());
-                               if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
-                               else {
-                                       int num = pDataArray->m->getNumNames(it->second);
-                                       sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num));
-                               }
-                       }
+            int alignLength = 0;
+            vector<seqData> sequences;
+            if (pDataArray->hasCount) {
+                vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]);
+                map<string, int> counts = cparser->getCountTable(pDataArray->groups[i]);
+                map<string, int>::iterator it;
+                
+                for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                    
+                    if (pDataArray->m->control_pressed) {  break; }
+                    
+                    it = counts.find(thisGroupsSeqs[i].getName());
+                    if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    else {
+                        sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+                        if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                    }
+                }
+            }else{
+                vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
+                map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
+                map<string, string>::iterator it;
+                
+                for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                    
+                    if (pDataArray->m->control_pressed) {  break; }
+                    
+                    it = nameMap.find(thisGroupsSeqs[i].getName());
+                    if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    else {
+                        int num = pDataArray->m->getNumNames(it->second);
+                        sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+                        if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                    }
+                }
+                
+            }
+            
                         
                         if (error) { pDataArray->m->control_pressed = true; }
                         
@@ -153,7 +190,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         sort(sequences.rbegin(), sequences.rend());
                         ////////////////////////////////////////////////////////////////////////////////////////
  
-                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                         
                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
                         ////////////////////////////////////////////////////////////////////////////////////////
@@ -184,7 +221,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         }
                         
                         int numSeqs = sequences.size();
-                       int alignLength = sequences[0].sequence.size();
+                       //int alignLength = sequences[0].sequence.size();
                         
                         ofstream chimeraFile;
                         ofstream accnosFile;
@@ -200,7 +237,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         
                         for(int j=0;j<numSeqs;j++){     
                                 
-                               if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                 
                                 vector<bool> restricted = chimeras;
                                 
@@ -217,7 +254,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                 
                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
                                 
-                               if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                 
                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
                                 
@@ -226,7 +263,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                 if(comparisons >= 2){   
                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         int minMismatchToTrimera = numeric_limits<int>::max();
                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
@@ -234,12 +271,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
                                                 
-                                               if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         }
                                         
                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         string type;
                                         string chimeraRefSeq;
@@ -253,16 +290,16 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
                                         }
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
@@ -304,11 +341,11 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + ".");        pDataArray->m->mothurOutEndLine();                                      
                         
-                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                 }       
                 
                 pDataArray->count = totalSeqs;
-               delete parser;
+               if (pDataArray->hasCount) { delete cparser; } { delete parser; }
                 return totalSeqs;
                 
         }
diff --git a/clustercommand.cpp b/clustercommand.cpp

index 5a469963875ab3673e4d9b892e9ecf472f5af09e..06e627a615dacf69fe80464db0123582bef510ed 100644 (file)
--- a/clustercommand.cpp
+++ b/clustercommand.cpp
@@ -154,6 +154,14 @@ ClusterCommand::ClusterCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
diff --git a/counttable.cpp b/counttable.cpp

index a664228b17c35fff4ce48ca0ea5255b5b99bb1f3..005ab8a2846bcd95dc40d1bf1d9b86325deb5e85 100644 (file)
--- a/counttable.cpp
+++ b/counttable.cpp
@@ -174,6 +174,75 @@ int CountTable::get(string seqName) {
                 exit(1);
         }
  }
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if (hasGroups) {  m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(1);
+            total++;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName, int thisTotal) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if (hasGroups) {  m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(thisTotal);
+            total+=thisTotal;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//add sequence with group info
+int CountTable::push_back(string seqName, vector<int> groupCounts) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if ((hasGroups) && (groupCounts.size() != getNumGroups())) {  m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            int thisTotal = 0;
+            for (int i = 0; i < getNumGroups(); i++) {   totalGroups[i] += groupCounts[i];  thisTotal += groupCounts[i]; }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(thisTotal);
+            total+= thisTotal;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+
  /************************************************************/
  //create ListVector from uniques
  ListVector CountTable::getListVector() {
diff --git a/counttable.h b/counttable.h

index 8baff3080a5f5124214e0b6b6834a899b356f0f9..adef8530e9d6e7d07ce16c23a3524c16f962d0b4 100644 (file)
--- a/counttable.h
+++ b/counttable.h
@@ -51,6 +51,12 @@ class CountTable {
          bool hasGroupInfo() { return hasGroups; }
          int getNumGroups() { return groups.size(); }
          vector<string> getNamesOfGroups() {  return groups;   }  //returns group names, if no group info vector is blank.
+        
+        int push_back(string); //add a sequence 
+        int push_back(string, int); //add a sequence 
+        int push_back(string, vector<int>); //add a sequence with group info
+        int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
+        int size() { return indexNameMap.size(); }
      
          vector<int> getGroupCounts(string);  //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function.
          int getGroupCount(string, string); //returns number of seqs for that group for that seq
@@ -59,11 +65,10 @@ class CountTable {
          int getNumSeqs() { return total; } //return total number of seqs
          int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs
          int getGroupIndex(string); //returns index in getGroupCounts vector of specific group
+    
          vector<string> getNamesOfSeqs();
          int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in.
-        int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
          ListVector getListVector();
-        int size() { return indexNameMap.size(); }
      
      private:
          string filename;
diff --git a/readcluster.cpp b/readcluster.cpp

index b6cb71de5fdbb888f4a7c1614987f74cd0bf8b1a..a6adabb5b8aa155d3b633f885fd37f7ba051465f 100644 (file)
--- a/readcluster.cpp
+++ b/readcluster.cpp
@@ -42,6 +42,26 @@ int ReadCluster::read(NameAssignment*& nameMap){
         }
  }
  /***********************************************************************/
+int ReadCluster::read(CountTable*& ct){
+       try {
+        
+               if (format == "phylip") { convertPhylip2Column(ct); }
+               else { list = new ListVector(ct->getListVector());  }
+               
+               if (m->control_pressed) { return 0; }
+               
+               if (sortWanted) {  OutPutFile = m->sortFile(distFile, outputDir);  }
+               else {  OutPutFile = distFile;   } //for use by clusters splitMatrix to convert a phylip matrix to column
+               
+               return 0;
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadCluster", "read");
+               exit(1);
+       }
+}
+/***********************************************************************/
  
  int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
         try {   
@@ -224,6 +244,181 @@ int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
  }
  /***********************************************************************/
  
+int ReadCluster::convertPhylip2Column(CountTable*& ct){
+       try {   
+               //convert phylip file to column file
+               map<int, string> rowToName;
+               map<int, string>::iterator it;
+               
+               ifstream in;
+               ofstream out;
+               string tempFile = distFile + ".column.temp";
+               
+               m->openInputFile(distFile, in);  m->gobble(in);
+               m->openOutputFile(tempFile, out);
+               
+               float distance;
+               int square, nseqs;
+               string name;
+               vector<string> matrixNames;
+               
+               string numTest;
+               in >> numTest >> name;
+               
+               if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+               else { convert(numTest, nseqs); }
+               
+               rowToName[0] = name;
+               matrixNames.push_back(name);
+               
+               if(ct == NULL){
+                       list = new ListVector(nseqs);
+                       list->set(0, name);
+               }
+               else{  list = new ListVector(ct->getListVector()); }
+        
+               char d;
+               while((d=in.get()) != EOF){
+                       
+                       if(isalnum(d)){
+                               square = 1;
+                               in.putback(d);
+                               for(int i=0;i<nseqs;i++){
+                                       in >> distance;
+                               }
+                               break;
+                       }
+                       if(d == '\n'){
+                               square = 0;
+                               break;
+                       }
+               }
+        
+               if(square == 0){
+            
+                       for(int i=1;i<nseqs;i++){
+                               in >> name;
+                               rowToName[i] = name;
+                               matrixNames.push_back(name);
+                               
+                               //there's A LOT of repeated code throughout this method...
+                               if(ct == NULL){
+                                       list->set(i, name);
+                                       
+                                       for(int j=0;j<i;j++){
+                        
+                                               if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+                                               
+                                               in >> distance;
+                                               
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if(distance < cutoff){
+                                                       out << i << '\t' << j << '\t' << distance << endl;
+                                               }
+                                       }
+                                       
+                               }
+                               else{
+                                       
+                                       for(int j=0;j<i;j++){
+                                               
+                                               if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+                                               
+                                               in >> distance;
+                                               
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if(distance < cutoff){
+                                                       out << i << '\t' << j << '\t' << distance << endl;
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else{
+                       for(int i=1;i<nseqs;i++){
+                               in >> name;                
+                               rowToName[i] = name;
+                               matrixNames.push_back(name);
+                
+                               if(ct == NULL){
+                                       list->set(i, name);
+                                       for(int j=0;j<nseqs;j++){
+                                               if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+                                               
+                                               in >> distance;
+                        
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if(distance < cutoff && j < i){
+                                                       out << i << '\t' << j << '\t' << distance << endl;
+                                               }
+                                       }
+                               }
+                               else{
+                                       for(int j=0;j<nseqs;j++){
+                                               if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+                                               
+                                               in >> distance;
+                        
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if(distance < cutoff && j < i){
+                                                       out << i << '\t' << j << '\t' << distance << endl;
+                                               }
+                                               
+                                       }
+                               }
+                       }
+               }
+               
+               list->setLabel("0");
+               in.close();
+               out.close();
+        
+               if(ct == NULL){
+                       ct = new CountTable();
+                       for(int i=0;i<matrixNames.size();i++){
+                               ct->push_back(matrixNames[i]);
+                       }
+               }
+               
+        
+               ifstream in2;
+               ofstream out2;
+               
+               string outputFile = m->getRootName(distFile) + "column.dist";
+               m->openInputFile(tempFile, in2);
+               m->openOutputFile(outputFile, out2);
+               
+               int first, second;
+               float dist;
+               
+               while (in2) {
+                       if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(tempFile); m->mothurRemove(outputFile); return 0; }
+                       
+                       in2 >> first >> second >> dist;
+                       out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
+                       m->gobble(in2);
+               }
+               in2.close();
+               out2.close();
+               
+               m->mothurRemove(tempFile);
+               distFile = outputFile;
+        
+               if (m->control_pressed) {  m->mothurRemove(outputFile);  }
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadCluster", "convertPhylip2Column");
+               exit(1);
+       }
+}
+/***********************************************************************/
+
  ReadCluster::~ReadCluster(){}
  /***********************************************************************/
  
diff --git a/readcluster.h b/readcluster.h

index a838dace17f35547421c03b0dfc8dcc6efdc2317..7ea579c2fc1a250bba3500f7d6dc59d3ebed26f8 100644 (file)
--- a/readcluster.h
+++ b/readcluster.h
@@ -13,6 +13,7 @@
  #include "mothur.h"
  #include "nameassignment.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  
  /******************************************************/
@@ -23,6 +24,7 @@ public:
         ReadCluster(string, float, string, bool);
         ~ReadCluster();
         int read(NameAssignment*&);
+    int read(CountTable*&);
         string getOutputFile() { return OutPutFile; }
         void setFormat(string f) { format = f;  }
         ListVector* getListVector()             {       return list;    }
@@ -36,6 +38,7 @@ private:
         bool sortWanted;
         
         int convertPhylip2Column(NameAssignment*&);
+    int convertPhylip2Column(CountTable*&);
  };
  
  /******************************************************/
diff --git a/sequencecountparser.cpp b/sequencecountparser.cpp

new file mode 100644 (file)

index 0000000..4ba6091
--- /dev/null
+++ b/sequencecountparser.cpp
@@ -0,0 +1,289 @@
+//
+//  sequencecountparser.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/7/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sequencecountparser.h"
+
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
+       try {
+               
+               m = MothurOut::getInstance();
+               
+               //read count file
+               CountTable countTable;
+               countTable.readTable(countfile);
+               
+               //initialize maps
+               namesOfGroups = countTable.getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {
+                       vector<Sequence> temp;
+                       map<string, int> tempMap;
+                       seqs[namesOfGroups[i]] = temp;
+                       countTablePerGroup[namesOfGroups[i]] = tempMap;
+               }
+               
+               //read fasta file making sure each sequence is in the group file
+               ifstream in;
+               m->openInputFile(fastafile, in);
+               
+        int fastaCount = 0;
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence seq(in); m->gobble(in);
+            fastaCount++;
+            if (m->debug) { if((fastaCount) % 1000 == 0){      m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                       
+            if (seq.getName() != "") {
+                               
+                allSeqsMap[seq.getName()] = seq.getName();
+                vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                
+                for (int i = 0; i < namesOfGroups.size(); i++) {
+                    if (groupCounts[i] != 0) {
+                        seqs[namesOfGroups[i]].push_back(seq); 
+                        countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                    }
+                }
+                       }
+               }
+               in.close();                                     
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
+       try {
+               
+               m = MothurOut::getInstance();
+                               
+               //initialize maps
+        if (countTable.hasGroupInfo()) {
+            namesOfGroups = countTable.getNamesOfGroups();
+            for (int i = 0; i < namesOfGroups.size(); i++) {
+                vector<Sequence> temp;
+                map<string, int> tempMap;
+                seqs[namesOfGroups[i]] = temp;
+                countTablePerGroup[namesOfGroups[i]] = tempMap;
+            }
+            
+            //read fasta file making sure each sequence is in the group file
+            ifstream in;
+            m->openInputFile(fastafile, in);
+            
+            int fastaCount = 0;
+            while (!in.eof()) {
+                
+                if (m->control_pressed) { break; }
+                
+                Sequence seq(in); m->gobble(in);
+                fastaCount++;
+                if (m->debug) { if((fastaCount) % 1000 == 0){  m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                
+                if (seq.getName() != "") {
+                    
+                    allSeqsMap[seq.getName()] = seq.getName();
+                    vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                    
+                    for (int i = 0; i < namesOfGroups.size(); i++) {
+                        if (groupCounts[i] != 0) {
+                            seqs[namesOfGroups[i]].push_back(seq);     
+                            countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                        }
+                    }
+                }
+            }
+            in.close();        
+        }else {  m->control_pressed = true;  m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::~SequenceCountParser(){  }
+/************************************************************/
+int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); }
+/************************************************************/
+vector<string> SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; }
+/************************************************************/
+int SequenceCountParser::getNumSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               int num = 0;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+               }else {
+                       num = (it->second).size();
+               }
+               
+               return num; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getNumSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<Sequence> SequenceCountParser::getSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       seqForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
+               }
+               
+               return seqForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               vector<seqPriorityNode> nameVector;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+                       
+                       seqForThisGroup = it->second;
+                       
+                       if (uchimeFormat) {
+                               // format should look like 
+                               //>seqName /ab=numRedundantSeqs/
+                               //sequence
+                               
+                               map<string, int> countForThisGroup = getCountTable(g);
+                               map<string, int>::iterator itCount;
+                               int error = 0;
+                               
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       itCount = countForThisGroup.find(seqForThisGroup[i].getName());
+                                       
+                                       if (itCount == countForThisGroup.end()){
+                                               error = 1;
+                                               m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
+                                       }else {
+                        seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
+                                               nameVector.push_back(temp);
+                                       }
+                               }
+                               
+                               if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               //sort by num represented
+                               sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+                
+                               //print new file in order of
+                               for (int i = 0; i < nameVector.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
+                               }
+                               
+                       }else { 
+                //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       seqForThisGroup[i].printSequence(out);  
+                               }
+                       }
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+
+/************************************************************/
+map<string, int> SequenceCountParser::getCountTable(string g){ 
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences.");  }
+               }
+               
+               return countForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getCountTable(string g, string filename){ 
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+            out << "Representative_Sequence\ttotal\t" << g << endl;
+            
+                       for (map<string, int>::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) {
+                               
+                               if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl;
+                       }
+                       
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+
+
+
diff --git a/sequencecountparser.h b/sequencecountparser.h

new file mode 100644 (file)

index 0000000..4889ea6
--- /dev/null
+++ b/sequencecountparser.h
@@ -0,0 +1,59 @@
+#ifndef Mothur_sequencecountparser_h
+#define Mothur_sequencecountparser_h
+
+//
+//  sequencecountparser.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/7/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "counttable.h"
+
+/* This class reads a fasta and count file and parses the data by group. The countfile must contain group information.
+ 
+ Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. 
+ This is because when we parse the count file we make a unique for each group instead of 1 unique for all
+ groups. 
+ 
+ */
+
+class SequenceCountParser {
+       
+public:
+       
+    SequenceCountParser(string, string);                       //count, fasta - file mismatches will set m->control_pressed = true
+    SequenceCountParser(string, CountTable&);          //fasta, counttable - file mismatches will set m->control_pressed = true
+    ~SequenceCountParser();
+    
+    //general operations
+    int getNumGroups();
+    vector<string> getNamesOfGroups(); 
+    
+    int getNumSeqs(string);            //returns the number of unique sequences in a specific group
+    vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
+    map<string, int> getCountTable(string); //returns seqName -> numberOfRedundantSeqs for a specific group - the count file format, but each line is parsed by group.
+    
+    int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat=false
+    int getCountTable(string, string); //print seqName -> numberRedundantSeqs for a specific group - group, filename
+    
+    map<string, string> getAllSeqsMap(){ return allSeqsMap; }  //returns map where the key=sequenceName and the value=representativeSequence - helps us remove duplicates after group by group processing
+private:
+       
+    CountTable countTable;
+    MothurOut* m;
+       
+    int numSeqs;
+    map<string, string> allSeqsMap;
+    map<string, vector<Sequence> > seqs; //a vector for each group
+    map<string, map<string, int> > countTablePerGroup; //countTable for each group
+    vector<string> namesOfGroups;
+};
+
+
+
+#endif
diff --git a/sequenceparser.cpp b/sequenceparser.cpp

index 2aff07ebad9eb0b642a17d59289704b2c85ec9ac..63010f346be2d473e86aa4b8c6a8feeecbaf7cca 100644 (file)
--- a/sequenceparser.cpp
+++ b/sequenceparser.cpp
@@ -310,8 +310,6 @@ vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGr
  /************************************************************/
  bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
  /************************************************************/
-string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
-/************************************************************/
  int SequenceParser::getNumSeqs(string g){ 
         try {
                 map<string, vector<Sequence> >::iterator it;
diff --git a/sequenceparser.h b/sequenceparser.h

index 23fcb9ecc983e1cf59ef1fde1f976da80773a152..98438f648b6f61d4c89192f7333aedfccfeced36 100644 (file)
--- a/sequenceparser.h
+++ b/sequenceparser.h
@@ -36,7 +36,6 @@ class SequenceParser {
                 int getNumGroups();
                 vector<string> getNamesOfGroups();      
                 bool isValidGroup(string);  //return true if string is a valid group
-               string getGroup(string);        //returns group of a specific sequence
                 
                 int getNumSeqs(string);         //returns the number of unique sequences in a specific group
                 vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
author	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)
committer	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 8 Aug 2012 15:07:46 +0000 (11:07 -0400)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
chimeraperseuscommand.cpp		patch \| blob \| history
chimeraperseuscommand.h		patch \| blob \| history
clustercommand.cpp		patch \| blob \| history
counttable.cpp		patch \| blob \| history
counttable.h		patch \| blob \| history
readcluster.cpp		patch \| blob \| history
readcluster.h		patch \| blob \| history
sequencecountparser.cpp	[new file with mode: 0644]	patch \| blob
sequencecountparser.h	[new file with mode: 0644]	patch \| blob
sequenceparser.cpp		patch \| blob \| history
sequenceparser.h		patch \| blob \| history