From c7e8c2d15bd7cedcfdf18675cb0ea1a0dcd0e3c0 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 8 Aug 2012 11:07:46 -0400 Subject: [PATCH] added SequenceCountParser class to parse the count table by group. added count parameter to chimera.perseus. --- Mothur.xcodeproj/project.pbxproj | 6 + chimeraperseuscommand.cpp | 343 +++++++++++++++++++++++-------- chimeraperseuscommand.h | 117 +++++++---- clustercommand.cpp | 8 + counttable.cpp | 69 +++++++ counttable.h | 9 +- readcluster.cpp | 195 ++++++++++++++++++ readcluster.h | 3 + sequencecountparser.cpp | 289 ++++++++++++++++++++++++++ sequencecountparser.h | 59 ++++++ sequenceparser.cpp | 2 - sequenceparser.h | 1 - 12 files changed, 973 insertions(+), 128 deletions(-) create mode 100644 sequencecountparser.cpp create mode 100644 sequencecountparser.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index ff0e35b..104c6d5 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -24,6 +24,7 @@ A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; }; A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; }; A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; }; + A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; }; A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; }; A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; }; @@ -397,6 +398,8 @@ A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = ""; }; A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = ""; }; A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = ""; }; + A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = ""; }; + A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = ""; }; A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = ""; }; A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = ""; }; A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = ""; }; @@ -1659,6 +1662,8 @@ A7E9B7D012D37EC400DA6239 /* sabundvector.hpp */, A7E9B7DB12D37EC400DA6239 /* sequence.cpp */, A7E9B7DC12D37EC400DA6239 /* sequence.hpp */, + A741FAD415D168A00067BCC5 /* sequencecountparser.h */, + A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */, A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */, A7E9B7DE12D37EC400DA6239 /* sequencedb.h */, A7F9F5CD141A5E500032F693 /* sequenceparser.h */, @@ -2192,6 +2197,7 @@ A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */, A74D59A4159A1E2000043046 /* counttable.cpp in Sources */, A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */, + A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index e3691e8..0955f2d 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -10,12 +10,15 @@ #include "chimeraperseuscommand.h" #include "deconvolutecommand.h" #include "sequence.hpp" +#include "counttable.h" +#include "sequencecountparser.h" //********************************************************************************************************************** vector ChimeraPerseusCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -36,10 +39,11 @@ vector ChimeraPerseusCommand::setParameters(){ string ChimeraPerseusCommand::getHelpString(){ try { string helpString = ""; - helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n"; + helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n"; helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; - helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -96,6 +100,8 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(){ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { try { abort = false; calledHelp = false; + hasCount = false; + hasName = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -107,7 +113,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { OptionParser parser(option); map parameters = parser.getParameters(); - ValidParameters validParameter("chimera.uchime"); + ValidParameters validParameter("chimera.perseus"); map::iterator it; //check to make sure all parameters are valid for command @@ -203,15 +209,9 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { - //if there is a current fasta file, use it - string filename = m->getNameFile(); - if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; } - hasName = false; - }else { + if (namefile == "not found") { namefile = ""; } + else { m->splitAtDash(namefile, nameFileNames); //go through files and make sure they are good, if not, then disregard them @@ -277,12 +277,101 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && !hasCount) { + //if there is a current name file, use it, else look for current count file + string filename = m->getNameFile(); + if (filename != "") { hasName = true; nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); } + else { + filename = m->getCountTableFile(); + if (filename != "") { hasCount = true; countfileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("[ERROR]: You must provide a count or name file."); m->mothurOutEndLine(); abort = true; } + } + } + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if (nameFileNames.size() != fastaFileNames.size()) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -360,6 +449,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -415,41 +505,82 @@ int ChimeraPerseusCommand::execute(){ int numSeqs = 0; int numChimeras = 0; - - if (groupFile != "") { - //Parse sequences by group - SequenceParser parser(groupFile, fastaFileNames[s], nameFile); - vector groups = parser.getNamesOfGroups(); - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - //clears files - ofstream out, out1, out2; - m->openOutputFile(outputFileName, out); out.close(); - m->openOutputFile(accnosFileName, out1); out1.close(); - - if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); } - else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName); - - m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - }else{ - if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } - - //read sequences and store sorted by frequency - vector sequences = readFiles(fastaFileNames[s], nameFile); - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + + if (hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(nameFile); + + if (ct->hasGroupInfo()) { + cparser = new SequenceCountParser(fastaFileNames[s], *ct); + + vector groups = cparser->getNamesOfGroups(); + + if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + + if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + map uniqueNames = cparser->getAllSeqsMap(); + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + delete cparser; + + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + + if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + }else { + if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //read sequences and store sorted by frequency + vector sequences = readFiles(fastaFileNames[s], ct); + + if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + } + delete ct; + }else { + if (groupFile != "") { + //Parse sequences by group + parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); + vector groups = parser->getNamesOfGroups(); + + if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + + if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + map uniqueNames = parser->getAllSeqsMap(); + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + delete parser; + + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + }else{ + if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //read sequences and store sorted by frequency + vector sequences = readFiles(fastaFileNames[s], nameFile); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + } } - + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine(); @@ -510,7 +641,7 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){ } } //********************************************************************************************************************** -int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector groups){ +int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector groups){ try { int totalSeqs = 0; @@ -522,7 +653,7 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa int start = time(NULL); if (m->control_pressed) { return 0; } - vector sequences = loadSequences(parser, groups[i]); + vector sequences = loadSequences(groups[i]); if (m->control_pressed) { return 0; } @@ -547,32 +678,48 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa } } //********************************************************************************************************************** -vector ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){ +vector ChimeraPerseusCommand::loadSequences(string group){ try { - - vector thisGroupsSeqs = parser.getSeqs(group); - map nameMap = parser.getNameMap(group); - map::iterator it; - - vector sequences; - bool error = false; - alignLength = 0; - - for (int i = 0; i < thisGroupsSeqs.size(); i++) { - - if (m->control_pressed) { return sequences; } - - it = nameMap.find(thisGroupsSeqs[i].getName()); - if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } - else { - int num = m->getNumNames(it->second); - sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); - if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } - } + bool error = false; + alignLength = 0; + vector sequences; + if (hasCount) { + vector thisGroupsSeqs = cparser->getSeqs(group); + map counts = cparser->getCountTable(group); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (m->control_pressed) { return sequences; } + + it = counts.find(thisGroupsSeqs[i].getName()); + if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); } + else { + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + }else{ + vector thisGroupsSeqs = parser->getSeqs(group); + map nameMap = parser->getNameMap(group); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (m->control_pressed) { return sequences; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } + else { + int num = m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + } - if (error) { m->control_pressed = true; } - + if (error) { m->control_pressed = true; } //sort by frequency sort(sequences.rbegin(), sequences.rend()); @@ -619,6 +766,37 @@ vector ChimeraPerseusCommand::readFiles(string inputFile, string name){ return sequences; } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "readFiles"); + exit(1); + } +} +//********************************************************************************************************************** +vector ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){ + try { + //read fasta file and create sequenceData structure - checking for file mismatches + vector sequences; + ifstream in; + m->openInputFile(inputFile, in); + alignLength = 0; + + while (!in.eof()) { + Sequence temp(in); m->gobble(in); + + int count = ct->getNumSeqs(temp.getName()); + if (m->control_pressed) { break; } + else { + sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count)); + if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); } + } + } + in.close(); + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + + return sequences; + } catch(exception& e) { m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile"); exit(1); @@ -771,7 +949,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque } } /**************************************************************************************************/ -int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector groups, string group, string fasta, string name) { +int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector groups, string group, string fasta, string name) { try { vector processIDS; @@ -801,7 +979,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); //pass numSeqs to parent ofstream out; @@ -819,7 +997,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string } //do my part - num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;i& uniqueNames, string outputFileName, string accnosFileName){ try { - map uniqueNames = parser.getAllSeqsMap(); map::iterator itUnique; int total = 0; diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index 1c4baa3..3608835 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -16,7 +16,9 @@ #include "mothur.h" #include "command.hpp" #include "sequenceparser.h" +#include "sequencecountparser.h" #include "myPerseus.h" +#include "counttable.h" /***********************************************************/ class ChimeraPerseusCommand : public Command { @@ -43,10 +45,12 @@ private: linePair(int i, int j) : start(i), end(j) {} }; - bool abort; - string fastafile, groupfile, outputDir, namefile; + bool abort, hasName, hasCount; + string fastafile, groupfile, countfile, outputDir, namefile; int processors, alignLength; double cutoff, alpha, beta; + SequenceParser* parser; + SequenceCountParser* cparser; vector outputNames; vector fastaFileNames; @@ -56,10 +60,11 @@ private: string getNamesFile(string&); int driver(string, vector&, string, int&); vector readFiles(string, string); - vector loadSequences(SequenceParser&, string); - int deconvoluteResults(SequenceParser&, string, string); - int driverGroups(SequenceParser&, string, string, int, int, vector); - int createProcessesGroups(SequenceParser&, string, string, vector, string, string, string); + vector readFiles(string inputFile, CountTable* ct); + vector loadSequences(string); + int deconvoluteResults(map&, string, string); + int driverGroups(string, string, int, int, vector); + int createProcessesGroups(string, string, vector, string, string, string); }; /**************************************************************************************************/ @@ -75,12 +80,13 @@ struct perseusData { MothurOut* m; int start; int end; + bool hasName, hasCount; int threadID, count, numChimeras; double alpha, beta, cutoff; vector groups; perseusData(){} - perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + perseusData(bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { alpha = a; beta = b; cutoff = c; @@ -94,6 +100,8 @@ struct perseusData { end = en; threadID = tid; groups = gr; + hasName = hn; + hasCount = hc; count = 0; numChimeras = 0; } @@ -114,38 +122,67 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } - + SequenceCountParser* cparser; + if (pDataArray->hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(pDataArray->namefile); + cparser = new SequenceCountParser(pDataArray->fastafile, *ct); + delete ct; + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } + int totalSeqs = 0; int numChimeras = 0; for (int i = pDataArray->start; i < pDataArray->end; i++) { - int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); //vector sequences = loadSequences(parser, groups[i]); - same function below //////////////////////////////////////////////////////////////////////////////////////// - vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); - map nameMap = parser->getNameMap(pDataArray->groups[i]); - map::iterator it; - - vector sequences; bool error = false; - - for (int j = 0; j < thisGroupsSeqs.size(); j++) { - - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } - - it = nameMap.find(thisGroupsSeqs[j].getName()); - if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - else { - int num = pDataArray->m->getNumNames(it->second); - sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num)); - } - } + int alignLength = 0; + vector sequences; + if (pDataArray->hasCount) { + vector thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]); + map counts = cparser->getCountTable(pDataArray->groups[i]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = counts.find(thisGroupsSeqs[i].getName()); + if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + }else{ + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); + map nameMap = parser->getNameMap(pDataArray->groups[i]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + int num = pDataArray->m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + + } + if (error) { pDataArray->m->control_pressed = true; } @@ -153,7 +190,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ sort(sequences.rbegin(), sequences.rend()); //////////////////////////////////////////////////////////////////////////////////////// - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below //////////////////////////////////////////////////////////////////////////////////////// @@ -184,7 +221,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ } int numSeqs = sequences.size(); - int alignLength = sequences[0].sequence.size(); + //int alignLength = sequences[0].sequence.size(); ofstream chimeraFile; ofstream accnosFile; @@ -200,7 +237,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ for(int j=0;jm->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } vector restricted = chimeras; @@ -217,7 +254,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; @@ -226,7 +263,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(comparisons >= 2){ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } int minMismatchToTrimera = numeric_limits::max(); int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; @@ -234,12 +271,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(minMismatchToChimera >= 3 && comparisons >= 3){ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } } double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } string type; string chimeraRefSeq; @@ -253,16 +290,16 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq); double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; @@ -304,11 +341,11 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } } pDataArray->count = totalSeqs; - delete parser; + if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs; } diff --git a/clustercommand.cpp b/clustercommand.cpp index 5a46996..06e627a 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -154,6 +154,14 @@ ClusterCommand::ClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters diff --git a/counttable.cpp b/counttable.cpp index a664228..005ab8a 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -174,6 +174,75 @@ int CountTable::get(string seqName) { exit(1); } } +/************************************************************/ +//add seqeunce without group info +int CountTable::push_back(string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; } + indexNameMap[seqName] = uniques; + totals.push_back(1); + total++; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ +//add seqeunce without group info +int CountTable::push_back(string seqName, int thisTotal) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; } + indexNameMap[seqName] = uniques; + totals.push_back(thisTotal); + total+=thisTotal; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ +//add sequence with group info +int CountTable::push_back(string seqName, vector groupCounts) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if ((hasGroups) && (groupCounts.size() != getNumGroups())) { m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + int thisTotal = 0; + for (int i = 0; i < getNumGroups(); i++) { totalGroups[i] += groupCounts[i]; thisTotal += groupCounts[i]; } + indexNameMap[seqName] = uniques; + totals.push_back(thisTotal); + total+= thisTotal; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} + /************************************************************/ //create ListVector from uniques ListVector CountTable::getListVector() { diff --git a/counttable.h b/counttable.h index 8baff30..adef853 100644 --- a/counttable.h +++ b/counttable.h @@ -51,6 +51,12 @@ class CountTable { bool hasGroupInfo() { return hasGroups; } int getNumGroups() { return groups.size(); } vector getNamesOfGroups() { return groups; } //returns group names, if no group info vector is blank. + + int push_back(string); //add a sequence + int push_back(string, int); //add a sequence + int push_back(string, vector); //add a sequence with group info + int get(string); //returns unique sequence index for reading distance matrices like NameAssignment + int size() { return indexNameMap.size(); } vector getGroupCounts(string); //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function. int getGroupCount(string, string); //returns number of seqs for that group for that seq @@ -59,11 +65,10 @@ class CountTable { int getNumSeqs() { return total; } //return total number of seqs int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs int getGroupIndex(string); //returns index in getGroupCounts vector of specific group + vector getNamesOfSeqs(); int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in. - int get(string); //returns unique sequence index for reading distance matrices like NameAssignment ListVector getListVector(); - int size() { return indexNameMap.size(); } private: string filename; diff --git a/readcluster.cpp b/readcluster.cpp index b6cb71d..a6adabb 100644 --- a/readcluster.cpp +++ b/readcluster.cpp @@ -42,6 +42,26 @@ int ReadCluster::read(NameAssignment*& nameMap){ } } /***********************************************************************/ +int ReadCluster::read(CountTable*& ct){ + try { + + if (format == "phylip") { convertPhylip2Column(ct); } + else { list = new ListVector(ct->getListVector()); } + + if (m->control_pressed) { return 0; } + + if (sortWanted) { OutPutFile = m->sortFile(distFile, outputDir); } + else { OutPutFile = distFile; } //for use by clusters splitMatrix to convert a phylip matrix to column + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ReadCluster", "read"); + exit(1); + } +} +/***********************************************************************/ int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){ try { @@ -224,6 +244,181 @@ int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){ } /***********************************************************************/ +int ReadCluster::convertPhylip2Column(CountTable*& ct){ + try { + //convert phylip file to column file + map rowToName; + map::iterator it; + + ifstream in; + ofstream out; + string tempFile = distFile + ".column.temp"; + + m->openInputFile(distFile, in); m->gobble(in); + m->openOutputFile(tempFile, out); + + float distance; + int square, nseqs; + string name; + vector matrixNames; + + string numTest; + in >> numTest >> name; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { convert(numTest, nseqs); } + + rowToName[0] = name; + matrixNames.push_back(name); + + if(ct == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ list = new ListVector(ct->getListVector()); } + + char d; + while((d=in.get()) != EOF){ + + if(isalnum(d)){ + square = 1; + in.putback(d); + for(int i=0;i> distance; + } + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + if(square == 0){ + + for(int i=1;i> name; + rowToName[i] = name; + matrixNames.push_back(name); + + //there's A LOT of repeated code throughout this method... + if(ct == NULL){ + list->set(i, name); + + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + + } + else{ + + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + } + } + } + else{ + for(int i=1;i> name; + rowToName[i] = name; + matrixNames.push_back(name); + + if(ct == NULL){ + list->set(i, name); + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff && j < i){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + } + else{ + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff && j < i){ + out << i << '\t' << j << '\t' << distance << endl; + } + + } + } + } + } + + list->setLabel("0"); + in.close(); + out.close(); + + if(ct == NULL){ + ct = new CountTable(); + for(int i=0;ipush_back(matrixNames[i]); + } + } + + + ifstream in2; + ofstream out2; + + string outputFile = m->getRootName(distFile) + "column.dist"; + m->openInputFile(tempFile, in2); + m->openOutputFile(outputFile, out2); + + int first, second; + float dist; + + while (in2) { + if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(tempFile); m->mothurRemove(outputFile); return 0; } + + in2 >> first >> second >> dist; + out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl; + m->gobble(in2); + } + in2.close(); + out2.close(); + + m->mothurRemove(tempFile); + distFile = outputFile; + + if (m->control_pressed) { m->mothurRemove(outputFile); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ReadCluster", "convertPhylip2Column"); + exit(1); + } +} +/***********************************************************************/ + ReadCluster::~ReadCluster(){} /***********************************************************************/ diff --git a/readcluster.h b/readcluster.h index a838dac..7ea579c 100644 --- a/readcluster.h +++ b/readcluster.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "nameassignment.hpp" #include "listvector.hpp" +#include "counttable.h" /******************************************************/ @@ -23,6 +24,7 @@ public: ReadCluster(string, float, string, bool); ~ReadCluster(); int read(NameAssignment*&); + int read(CountTable*&); string getOutputFile() { return OutPutFile; } void setFormat(string f) { format = f; } ListVector* getListVector() { return list; } @@ -36,6 +38,7 @@ private: bool sortWanted; int convertPhylip2Column(NameAssignment*&); + int convertPhylip2Column(CountTable*&); }; /******************************************************/ diff --git a/sequencecountparser.cpp b/sequencecountparser.cpp new file mode 100644 index 0000000..4ba6091 --- /dev/null +++ b/sequencecountparser.cpp @@ -0,0 +1,289 @@ +// +// sequencecountparser.cpp +// Mothur +// +// Created by Sarah Westcott on 8/7/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "sequencecountparser.h" + +/************************************************************/ +SequenceCountParser::SequenceCountParser(string countfile, string fastafile) { + try { + + m = MothurOut::getInstance(); + + //read count file + CountTable countTable; + countTable.readTable(countfile); + + //initialize maps + namesOfGroups = countTable.getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + countTablePerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastafile, in); + + int fastaCount = 0; + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } + + if (seq.getName() != "") { + + allSeqsMap[seq.getName()] = seq.getName(); + vector groupCounts = countTable.getGroupCounts(seq.getName()); + + for (int i = 0; i < namesOfGroups.size(); i++) { + if (groupCounts[i] != 0) { + seqs[namesOfGroups[i]].push_back(seq); + countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i]; + } + } + } + } + in.close(); + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "SequenceCountParser"); + exit(1); + } +} +/************************************************************/ +SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) { + try { + + m = MothurOut::getInstance(); + + //initialize maps + if (countTable.hasGroupInfo()) { + namesOfGroups = countTable.getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + countTablePerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastafile, in); + + int fastaCount = 0; + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } + + if (seq.getName() != "") { + + allSeqsMap[seq.getName()] = seq.getName(); + vector groupCounts = countTable.getGroupCounts(seq.getName()); + + for (int i = 0; i < namesOfGroups.size(); i++) { + if (groupCounts[i] != 0) { + seqs[namesOfGroups[i]].push_back(seq); + countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i]; + } + } + } + } + in.close(); + }else { m->control_pressed = true; m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); } + + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "SequenceCountParser"); + exit(1); + } +} +/************************************************************/ +SequenceCountParser::~SequenceCountParser(){ } +/************************************************************/ +int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); } +/************************************************************/ +vector SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; } +/************************************************************/ +int SequenceCountParser::getNumSeqs(string g){ + try { + map >::iterator it; + int num = 0; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine(); + }else { + num = (it->second).size(); + } + + return num; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getNumSeqs"); + exit(1); + } +} +/************************************************************/ +vector SequenceCountParser::getSeqs(string g){ + try { + map >::iterator it; + vector seqForThisGroup; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + seqForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); } + } + + return seqForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getSeqs"); + exit(1); + } +} +/************************************************************/ +int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ + try { + map >::iterator it; + vector seqForThisGroup; + vector nameVector; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + + ofstream out; + m->openOutputFile(filename, out); + + seqForThisGroup = it->second; + + if (uchimeFormat) { + // format should look like + //>seqName /ab=numRedundantSeqs/ + //sequence + + map countForThisGroup = getCountTable(g); + map::iterator itCount; + int error = 0; + + for (int i = 0; i < seqForThisGroup.size(); i++) { + itCount = countForThisGroup.find(seqForThisGroup[i].getName()); + + if (itCount == countForThisGroup.end()){ + error = 1; + m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine(); + }else { + seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName()); + nameVector.push_back(temp); + } + } + + if (error == 1) { out.close(); m->mothurRemove(filename); return 1; } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; + } + + }else { + //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n"); + for (int i = 0; i < seqForThisGroup.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + seqForThisGroup[i].printSequence(out); + } + } + out.close(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getSeqs"); + exit(1); + } +} + +/************************************************************/ +map SequenceCountParser::getCountTable(string g){ + try { + map >::iterator it; + map countForThisGroup; + + it = countTablePerGroup.find(g); + if(it == countTablePerGroup.end()) { + m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + countForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences."); } + } + + return countForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getCountTable"); + exit(1); + } +} +/************************************************************/ +int SequenceCountParser::getCountTable(string g, string filename){ + try { + map >::iterator it; + map countForThisGroup; + + it = countTablePerGroup.find(g); + if(it == countTablePerGroup.end()) { + m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + countForThisGroup = it->second; + + ofstream out; + m->openOutputFile(filename, out); + out << "Representative_Sequence\ttotal\t" << g << endl; + + for (map::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl; + } + + out.close(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getCountTable"); + exit(1); + } +} +/************************************************************/ + + + diff --git a/sequencecountparser.h b/sequencecountparser.h new file mode 100644 index 0000000..4889ea6 --- /dev/null +++ b/sequencecountparser.h @@ -0,0 +1,59 @@ +#ifndef Mothur_sequencecountparser_h +#define Mothur_sequencecountparser_h + +// +// sequencecountparser.h +// Mothur +// +// Created by Sarah Westcott on 8/7/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" +#include "counttable.h" + +/* This class reads a fasta and count file and parses the data by group. The countfile must contain group information. + + Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. + This is because when we parse the count file we make a unique for each group instead of 1 unique for all + groups. + + */ + +class SequenceCountParser { + +public: + + SequenceCountParser(string, string); //count, fasta - file mismatches will set m->control_pressed = true + SequenceCountParser(string, CountTable&); //fasta, counttable - file mismatches will set m->control_pressed = true + ~SequenceCountParser(); + + //general operations + int getNumGroups(); + vector getNamesOfGroups(); + + int getNumSeqs(string); //returns the number of unique sequences in a specific group + vector getSeqs(string); //returns unique sequences in a specific group + map getCountTable(string); //returns seqName -> numberOfRedundantSeqs for a specific group - the count file format, but each line is parsed by group. + + int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat=false + int getCountTable(string, string); //print seqName -> numberRedundantSeqs for a specific group - group, filename + + map getAllSeqsMap(){ return allSeqsMap; } //returns map where the key=sequenceName and the value=representativeSequence - helps us remove duplicates after group by group processing +private: + + CountTable countTable; + MothurOut* m; + + int numSeqs; + map allSeqsMap; + map > seqs; //a vector for each group + map > countTablePerGroup; //countTable for each group + vector namesOfGroups; +}; + + + +#endif diff --git a/sequenceparser.cpp b/sequenceparser.cpp index 2aff07e..63010f3 100644 --- a/sequenceparser.cpp +++ b/sequenceparser.cpp @@ -310,8 +310,6 @@ vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGr /************************************************************/ bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } /************************************************************/ -string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } -/************************************************************/ int SequenceParser::getNumSeqs(string g){ try { map >::iterator it; diff --git a/sequenceparser.h b/sequenceparser.h index 23fcb9e..98438f6 100644 --- a/sequenceparser.h +++ b/sequenceparser.h @@ -36,7 +36,6 @@ class SequenceParser { int getNumGroups(); vector getNamesOfGroups(); bool isValidGroup(string); //return true if string is a valid group - string getGroup(string); //returns group of a specific sequence int getNumSeqs(string); //returns the number of unique sequences in a specific group vector getSeqs(string); //returns unique sequences in a specific group -- 2.39.2