A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; };
A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
+ A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; };
A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; };
A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; };
A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = "<group>"; };
A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = "<group>"; };
A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = "<group>"; };
+ A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = "<group>"; };
+ A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = "<group>"; };
A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = "<group>"; };
A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = "<group>"; };
A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = "<group>"; };
A7E9B7D012D37EC400DA6239 /* sabundvector.hpp */,
A7E9B7DB12D37EC400DA6239 /* sequence.cpp */,
A7E9B7DC12D37EC400DA6239 /* sequence.hpp */,
+ A741FAD415D168A00067BCC5 /* sequencecountparser.h */,
+ A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */,
A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */,
A7E9B7DE12D37EC400DA6239 /* sequencedb.h */,
A7F9F5CD141A5E500032F693 /* sequenceparser.h */,
A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */,
A74D59A4159A1E2000043046 /* counttable.cpp in Sources */,
A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */,
+ A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#include "chimeraperseuscommand.h"
#include "deconvolutecommand.h"
#include "sequence.hpp"
+#include "counttable.h"
+#include "sequencecountparser.h"
//**********************************************************************************************************************
vector<string> ChimeraPerseusCommand::setParameters(){
try {
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
- CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
- CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+ CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+ CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
string ChimeraPerseusCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
+ helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
- helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
+ helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
+ helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n";
helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
ChimeraPerseusCommand::ChimeraPerseusCommand(string option) {
try {
abort = false; calledHelp = false;
+ hasCount = false;
+ hasName = false;
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
OptionParser parser(option);
map<string,string> parameters = parser.getParameters();
- ValidParameters validParameter("chimera.uchime");
+ ValidParameters validParameter("chimera.perseus");
map<string,string>::iterator it;
//check to make sure all parameters are valid for command
//check for required parameters
- bool hasName = true;
namefile = validParameter.validFile(parameters, "name", false);
- if (namefile == "not found") {
- //if there is a current fasta file, use it
- string filename = m->getNameFile();
- if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
- else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
- hasName = false;
- }else {
+ if (namefile == "not found") { namefile = ""; }
+ else {
m->splitAtDash(namefile, nameFileNames);
//go through files and make sure they are good, if not, then disregard them
}
}
}
+ }
+
+ if (nameFileNames.size() != 0) { hasName = true; }
+
+ //check for required parameters
+ vector<string> countfileNames;
+ countfile = validParameter.validFile(parameters, "count", false);
+ if (countfile == "not found") {
+ countfile = "";
+ }else {
+ m->splitAtDash(countfile, countfileNames);
- //make sure there is at least one valid file left
- if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < countfileNames.size(); i++) {
+
+ bool ignore = false;
+ if (countfileNames[i] == "current") {
+ countfileNames[i] = m->getCountTableFile();
+ if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true;
+ //erase from file list
+ countfileNames.erase(countfileNames.begin()+i);
+ i--;
+ }
+ }
+
+ if (!ignore) {
+
+ if (inputDir != "") {
+ string path = m->hasPath(countfileNames[i]);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; }
+ }
+
+ int ableToOpen;
+ ifstream in;
+
+ ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+ m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ countfileNames[i] = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+ m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ countfileNames[i] = tryPath;
+ }
+ }
+
+ in.close();
+
+ if (ableToOpen == 1) {
+ m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+ //erase from file list
+ countfileNames.erase(countfileNames.begin()+i);
+ i--;
+ }else {
+ m->setCountTableFile(countfileNames[i]);
+ }
+ }
+ }
}
-
- if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+
+ if (countfileNames.size() != 0) { hasCount = true; }
+
+ //make sure there is at least one valid file left
+ if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+
+ if (!hasName && !hasCount) {
+ //if there is a current name file, use it, else look for current count file
+ string filename = m->getNameFile();
+ if (filename != "") { hasName = true; nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
+ else {
+ filename = m->getCountTableFile();
+ if (filename != "") { hasCount = true; countfileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the count parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("[ERROR]: You must provide a count or name file."); m->mothurOutEndLine(); abort = true; }
+ }
+ }
+ if (!hasName && hasCount) { nameFileNames = countfileNames; }
+
+ if (nameFileNames.size() != fastaFileNames.size()) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
bool hasGroup = true;
groupfile = validParameter.validFile(parameters, "group", false);
if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+ if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
int numSeqs = 0;
int numChimeras = 0;
-
- if (groupFile != "") {
- //Parse sequences by group
- SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
- vector<string> groups = parser.getNamesOfGroups();
-
- if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
-
- //clears files
- ofstream out, out1, out2;
- m->openOutputFile(outputFileName, out); out.close();
- m->openOutputFile(accnosFileName, out1); out1.close();
-
- if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); }
- else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
-
- if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
-
- numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
-
- m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
-
- if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
-
- }else{
- if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
-
- //read sequences and store sorted by frequency
- vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
-
- if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
-
- numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
+
+ if (hasCount) {
+ CountTable* ct = new CountTable();
+ ct->readTable(nameFile);
+
+ if (ct->hasGroupInfo()) {
+ cparser = new SequenceCountParser(fastaFileNames[s], *ct);
+
+ vector<string> groups = cparser->getNamesOfGroups();
+
+ if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ //clears files
+ ofstream out, out1, out2;
+ m->openOutputFile(outputFileName, out); out.close();
+ m->openOutputFile(accnosFileName, out1); out1.close();
+
+ if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); }
+ else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
+
+ if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+ map<string, string> uniqueNames = cparser->getAllSeqsMap();
+ numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+ delete cparser;
+
+ m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
+
+ if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ }else {
+ if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+
+ //read sequences and store sorted by frequency
+ vector<seqData> sequences = readFiles(fastaFileNames[s], ct);
+
+ if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
+ }
+ delete ct;
+ }else {
+ if (groupFile != "") {
+ //Parse sequences by group
+ parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile);
+ vector<string> groups = parser->getNamesOfGroups();
+
+ if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ //clears files
+ ofstream out, out1, out2;
+ m->openOutputFile(outputFileName, out); out.close();
+ m->openOutputFile(accnosFileName, out1); out1.close();
+
+ if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); }
+ else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
+
+ if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+ map<string, string> uniqueNames = parser->getAllSeqsMap();
+ numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+ delete parser;
+
+ m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+ }else{
+ if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+
+ //read sequences and store sorted by frequency
+ vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
+ }
}
-
+
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine();
}
}
//**********************************************************************************************************************
-int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
+int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector<string> groups){
try {
int totalSeqs = 0;
int start = time(NULL); if (m->control_pressed) { return 0; }
- vector<seqData> sequences = loadSequences(parser, groups[i]);
+ vector<seqData> sequences = loadSequences(groups[i]);
if (m->control_pressed) { return 0; }
}
}
//**********************************************************************************************************************
-vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
+vector<seqData> ChimeraPerseusCommand::loadSequences(string group){
try {
-
- vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
- map<string, string> nameMap = parser.getNameMap(group);
- map<string, string>::iterator it;
-
- vector<seqData> sequences;
- bool error = false;
- alignLength = 0;
-
- for (int i = 0; i < thisGroupsSeqs.size(); i++) {
-
- if (m->control_pressed) { return sequences; }
-
- it = nameMap.find(thisGroupsSeqs[i].getName());
- if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
- else {
- int num = m->getNumNames(it->second);
- sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
- if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
- }
+ bool error = false;
+ alignLength = 0;
+ vector<seqData> sequences;
+ if (hasCount) {
+ vector<Sequence> thisGroupsSeqs = cparser->getSeqs(group);
+ map<string, int> counts = cparser->getCountTable(group);
+ map<string, int>::iterator it;
+
+ for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+
+ if (m->control_pressed) { return sequences; }
+
+ it = counts.find(thisGroupsSeqs[i].getName());
+ if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); }
+ else {
+ sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+ }
+ }
+ }else{
+ vector<Sequence> thisGroupsSeqs = parser->getSeqs(group);
+ map<string, string> nameMap = parser->getNameMap(group);
+ map<string, string>::iterator it;
+
+ for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+
+ if (m->control_pressed) { return sequences; }
+
+ it = nameMap.find(thisGroupsSeqs[i].getName());
+ if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
+ else {
+ int num = m->getNumNames(it->second);
+ sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+ }
+ }
+
}
- if (error) { m->control_pressed = true; }
-
+ if (error) { m->control_pressed = true; }
//sort by frequency
sort(sequences.rbegin(), sequences.rend());
return sequences;
}
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){
+ try {
+ //read fasta file and create sequenceData structure - checking for file mismatches
+ vector<seqData> sequences;
+ ifstream in;
+ m->openInputFile(inputFile, in);
+ alignLength = 0;
+
+ while (!in.eof()) {
+ Sequence temp(in); m->gobble(in);
+
+ int count = ct->getNumSeqs(temp.getName());
+ if (m->control_pressed) { break; }
+ else {
+ sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count));
+ if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
+ }
+ }
+ in.close();
+
+ //sort by frequency
+ sort(sequences.rbegin(), sequences.rend());
+
+ return sequences;
+ }
catch(exception& e) {
m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
exit(1);
}
}
/**************************************************************************************************/
-int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
+int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
try {
vector<int> processIDS;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
+ num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
//pass numSeqs to parent
ofstream out;
}
//do my part
- num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+ num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
// Allocate memory for thread data.
string extension = toString(i) + ".temp";
- perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
+ perseusData* tempPerseus = new perseusData(hasName, hasCount, alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
pDataArray.push_back(tempPerseus);
processIDS.push_back(i);
//using the main process as a worker saves time and memory
- num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+ num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
//Wait until all threads have terminated.
WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
}
}
//**********************************************************************************************************************
-int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
+int ChimeraPerseusCommand::deconvoluteResults(map<string, string>& uniqueNames, string outputFileName, string accnosFileName){
try {
- map<string, string> uniqueNames = parser.getAllSeqsMap();
map<string, string>::iterator itUnique;
int total = 0;
#include "mothur.h"
#include "command.hpp"
#include "sequenceparser.h"
+#include "sequencecountparser.h"
#include "myPerseus.h"
+#include "counttable.h"
/***********************************************************/
class ChimeraPerseusCommand : public Command {
linePair(int i, int j) : start(i), end(j) {}
};
- bool abort;
- string fastafile, groupfile, outputDir, namefile;
+ bool abort, hasName, hasCount;
+ string fastafile, groupfile, countfile, outputDir, namefile;
int processors, alignLength;
double cutoff, alpha, beta;
+ SequenceParser* parser;
+ SequenceCountParser* cparser;
vector<string> outputNames;
vector<string> fastaFileNames;
string getNamesFile(string&);
int driver(string, vector<seqData>&, string, int&);
vector<seqData> readFiles(string, string);
- vector<seqData> loadSequences(SequenceParser&, string);
- int deconvoluteResults(SequenceParser&, string, string);
- int driverGroups(SequenceParser&, string, string, int, int, vector<string>);
- int createProcessesGroups(SequenceParser&, string, string, vector<string>, string, string, string);
+ vector<seqData> readFiles(string inputFile, CountTable* ct);
+ vector<seqData> loadSequences(string);
+ int deconvoluteResults(map<string, string>&, string, string);
+ int driverGroups(string, string, int, int, vector<string>);
+ int createProcessesGroups(string, string, vector<string>, string, string, string);
};
/**************************************************************************************************/
MothurOut* m;
int start;
int end;
+ bool hasName, hasCount;
int threadID, count, numChimeras;
double alpha, beta, cutoff;
vector<string> groups;
perseusData(){}
- perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
+ perseusData(bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
alpha = a;
beta = b;
cutoff = c;
end = en;
threadID = tid;
groups = gr;
+ hasName = hn;
+ hasCount = hc;
count = 0;
numChimeras = 0;
}
//parse fasta and name file by group
SequenceParser* parser;
- if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
- else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
-
+ SequenceCountParser* cparser;
+ if (pDataArray->hasCount) {
+ CountTable* ct = new CountTable();
+ ct->readTable(pDataArray->namefile);
+ cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
+ delete ct;
+ }else {
+ if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+ else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
+ }
+
int totalSeqs = 0;
int numChimeras = 0;
for (int i = pDataArray->start; i < pDataArray->end; i++) {
- int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+ int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine();
//vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
////////////////////////////////////////////////////////////////////////////////////////
- vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
- map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
- map<string, string>::iterator it;
-
- vector<seqData> sequences;
bool error = false;
-
- for (int j = 0; j < thisGroupsSeqs.size(); j++) {
-
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
-
- it = nameMap.find(thisGroupsSeqs[j].getName());
- if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
- else {
- int num = pDataArray->m->getNumNames(it->second);
- sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num));
- }
- }
+ int alignLength = 0;
+ vector<seqData> sequences;
+ if (pDataArray->hasCount) {
+ vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]);
+ map<string, int> counts = cparser->getCountTable(pDataArray->groups[i]);
+ map<string, int>::iterator it;
+
+ for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ it = counts.find(thisGroupsSeqs[i].getName());
+ if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+ else {
+ sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+ }
+ }
+ }else{
+ vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
+ map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
+ map<string, string>::iterator it;
+
+ for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ it = nameMap.find(thisGroupsSeqs[i].getName());
+ if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ else {
+ int num = pDataArray->m->getNumNames(it->second);
+ sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+ }
+ }
+
+ }
+
if (error) { pDataArray->m->control_pressed = true; }
sort(sequences.rbegin(), sequences.rend());
////////////////////////////////////////////////////////////////////////////////////////
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
//int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
////////////////////////////////////////////////////////////////////////////////////////
}
int numSeqs = sequences.size();
- int alignLength = sequences[0].sequence.size();
+ //int alignLength = sequences[0].sequence.size();
ofstream chimeraFile;
ofstream accnosFile;
for(int j=0;j<numSeqs;j++){
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
vector<bool> restricted = chimeras;
int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
if(comparisons >= 2){
minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
int minMismatchToTrimera = numeric_limits<int>::max();
int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
if(minMismatchToChimera >= 3 && comparisons >= 3){
minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
}
double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
string type;
string chimeraRefSeq;
chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
}
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine();
- if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
}
pDataArray->count = totalSeqs;
- delete parser;
+ if (pDataArray->hasCount) { delete cparser; } { delete parser; }
return totalSeqs;
}
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
+
+ it = parameters.find("count");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["count"] = inputDir + it->second; }
+ }
}
//check for required parameters
exit(1);
}
}
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName) {
+ try {
+ map<string, int>::iterator it = indexNameMap.find(seqName);
+ if (it == indexNameMap.end()) {
+ if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ indexNameMap[seqName] = uniques;
+ totals.push_back(1);
+ total++;
+ uniques++;
+ }else {
+ m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CountTable", "push_back");
+ exit(1);
+ }
+}
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName, int thisTotal) {
+ try {
+ map<string, int>::iterator it = indexNameMap.find(seqName);
+ if (it == indexNameMap.end()) {
+ if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ indexNameMap[seqName] = uniques;
+ totals.push_back(thisTotal);
+ total+=thisTotal;
+ uniques++;
+ }else {
+ m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CountTable", "push_back");
+ exit(1);
+ }
+}
+/************************************************************/
+//add sequence with group info
+int CountTable::push_back(string seqName, vector<int> groupCounts) {
+ try {
+ map<string, int>::iterator it = indexNameMap.find(seqName);
+ if (it == indexNameMap.end()) {
+ if ((hasGroups) && (groupCounts.size() != getNumGroups())) { m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ int thisTotal = 0;
+ for (int i = 0; i < getNumGroups(); i++) { totalGroups[i] += groupCounts[i]; thisTotal += groupCounts[i]; }
+ indexNameMap[seqName] = uniques;
+ totals.push_back(thisTotal);
+ total+= thisTotal;
+ uniques++;
+ }else {
+ m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CountTable", "push_back");
+ exit(1);
+ }
+}
+
/************************************************************/
//create ListVector from uniques
ListVector CountTable::getListVector() {
bool hasGroupInfo() { return hasGroups; }
int getNumGroups() { return groups.size(); }
vector<string> getNamesOfGroups() { return groups; } //returns group names, if no group info vector is blank.
+
+ int push_back(string); //add a sequence
+ int push_back(string, int); //add a sequence
+ int push_back(string, vector<int>); //add a sequence with group info
+ int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
+ int size() { return indexNameMap.size(); }
vector<int> getGroupCounts(string); //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function.
int getGroupCount(string, string); //returns number of seqs for that group for that seq
int getNumSeqs() { return total; } //return total number of seqs
int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs
int getGroupIndex(string); //returns index in getGroupCounts vector of specific group
+
vector<string> getNamesOfSeqs();
int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in.
- int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
ListVector getListVector();
- int size() { return indexNameMap.size(); }
private:
string filename;
}
}
/***********************************************************************/
+int ReadCluster::read(CountTable*& ct){
+ try {
+
+ if (format == "phylip") { convertPhylip2Column(ct); }
+ else { list = new ListVector(ct->getListVector()); }
+
+ if (m->control_pressed) { return 0; }
+
+ if (sortWanted) { OutPutFile = m->sortFile(distFile, outputDir); }
+ else { OutPutFile = distFile; } //for use by clusters splitMatrix to convert a phylip matrix to column
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ReadCluster", "read");
+ exit(1);
+ }
+}
+/***********************************************************************/
int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
try {
}
/***********************************************************************/
+int ReadCluster::convertPhylip2Column(CountTable*& ct){
+ try {
+ //convert phylip file to column file
+ map<int, string> rowToName;
+ map<int, string>::iterator it;
+
+ ifstream in;
+ ofstream out;
+ string tempFile = distFile + ".column.temp";
+
+ m->openInputFile(distFile, in); m->gobble(in);
+ m->openOutputFile(tempFile, out);
+
+ float distance;
+ int square, nseqs;
+ string name;
+ vector<string> matrixNames;
+
+ string numTest;
+ in >> numTest >> name;
+
+ if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+ else { convert(numTest, nseqs); }
+
+ rowToName[0] = name;
+ matrixNames.push_back(name);
+
+ if(ct == NULL){
+ list = new ListVector(nseqs);
+ list->set(0, name);
+ }
+ else{ list = new ListVector(ct->getListVector()); }
+
+ char d;
+ while((d=in.get()) != EOF){
+
+ if(isalnum(d)){
+ square = 1;
+ in.putback(d);
+ for(int i=0;i<nseqs;i++){
+ in >> distance;
+ }
+ break;
+ }
+ if(d == '\n'){
+ square = 0;
+ break;
+ }
+ }
+
+ if(square == 0){
+
+ for(int i=1;i<nseqs;i++){
+ in >> name;
+ rowToName[i] = name;
+ matrixNames.push_back(name);
+
+ //there's A LOT of repeated code throughout this method...
+ if(ct == NULL){
+ list->set(i, name);
+
+ for(int j=0;j<i;j++){
+
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+
+ }
+ else{
+
+ for(int j=0;j<i;j++){
+
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+ }
+ }
+ }
+ else{
+ for(int i=1;i<nseqs;i++){
+ in >> name;
+ rowToName[i] = name;
+ matrixNames.push_back(name);
+
+ if(ct == NULL){
+ list->set(i, name);
+ for(int j=0;j<nseqs;j++){
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff && j < i){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+ }
+ else{
+ for(int j=0;j<nseqs;j++){
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff && j < i){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+
+ }
+ }
+ }
+ }
+
+ list->setLabel("0");
+ in.close();
+ out.close();
+
+ if(ct == NULL){
+ ct = new CountTable();
+ for(int i=0;i<matrixNames.size();i++){
+ ct->push_back(matrixNames[i]);
+ }
+ }
+
+
+ ifstream in2;
+ ofstream out2;
+
+ string outputFile = m->getRootName(distFile) + "column.dist";
+ m->openInputFile(tempFile, in2);
+ m->openOutputFile(outputFile, out2);
+
+ int first, second;
+ float dist;
+
+ while (in2) {
+ if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(tempFile); m->mothurRemove(outputFile); return 0; }
+
+ in2 >> first >> second >> dist;
+ out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
+ m->gobble(in2);
+ }
+ in2.close();
+ out2.close();
+
+ m->mothurRemove(tempFile);
+ distFile = outputFile;
+
+ if (m->control_pressed) { m->mothurRemove(outputFile); }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ReadCluster", "convertPhylip2Column");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
ReadCluster::~ReadCluster(){}
/***********************************************************************/
#include "mothur.h"
#include "nameassignment.hpp"
#include "listvector.hpp"
+#include "counttable.h"
/******************************************************/
ReadCluster(string, float, string, bool);
~ReadCluster();
int read(NameAssignment*&);
+ int read(CountTable*&);
string getOutputFile() { return OutPutFile; }
void setFormat(string f) { format = f; }
ListVector* getListVector() { return list; }
bool sortWanted;
int convertPhylip2Column(NameAssignment*&);
+ int convertPhylip2Column(CountTable*&);
};
/******************************************************/
--- /dev/null
+//
+// sequencecountparser.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 8/7/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sequencecountparser.h"
+
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
+ try {
+
+ m = MothurOut::getInstance();
+
+ //read count file
+ CountTable countTable;
+ countTable.readTable(countfile);
+
+ //initialize maps
+ namesOfGroups = countTable.getNamesOfGroups();
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ vector<Sequence> temp;
+ map<string, int> tempMap;
+ seqs[namesOfGroups[i]] = temp;
+ countTablePerGroup[namesOfGroups[i]] = tempMap;
+ }
+
+ //read fasta file making sure each sequence is in the group file
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ int fastaCount = 0;
+ while (!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); m->gobble(in);
+ fastaCount++;
+ if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
+
+ if (seq.getName() != "") {
+
+ allSeqsMap[seq.getName()] = seq.getName();
+ vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ if (groupCounts[i] != 0) {
+ seqs[namesOfGroups[i]].push_back(seq);
+ countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+ }
+ }
+ }
+ }
+ in.close();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+ exit(1);
+ }
+}
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
+ try {
+
+ m = MothurOut::getInstance();
+
+ //initialize maps
+ if (countTable.hasGroupInfo()) {
+ namesOfGroups = countTable.getNamesOfGroups();
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ vector<Sequence> temp;
+ map<string, int> tempMap;
+ seqs[namesOfGroups[i]] = temp;
+ countTablePerGroup[namesOfGroups[i]] = tempMap;
+ }
+
+ //read fasta file making sure each sequence is in the group file
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ int fastaCount = 0;
+ while (!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); m->gobble(in);
+ fastaCount++;
+ if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } }
+
+ if (seq.getName() != "") {
+
+ allSeqsMap[seq.getName()] = seq.getName();
+ vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ if (groupCounts[i] != 0) {
+ seqs[namesOfGroups[i]].push_back(seq);
+ countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+ }
+ }
+ }
+ }
+ in.close();
+ }else { m->control_pressed = true; m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+ exit(1);
+ }
+}
+/************************************************************/
+SequenceCountParser::~SequenceCountParser(){ }
+/************************************************************/
+int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); }
+/************************************************************/
+vector<string> SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; }
+/************************************************************/
+int SequenceCountParser::getNumSeqs(string g){
+ try {
+ map<string, vector<Sequence> >::iterator it;
+ int num = 0;
+
+ it = seqs.find(g);
+ if(it == seqs.end()) {
+ m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+ }else {
+ num = (it->second).size();
+ }
+
+ return num;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "getNumSeqs");
+ exit(1);
+ }
+}
+/************************************************************/
+vector<Sequence> SequenceCountParser::getSeqs(string g){
+ try {
+ map<string, vector<Sequence> >::iterator it;
+ vector<Sequence> seqForThisGroup;
+
+ it = seqs.find(g);
+ if(it == seqs.end()) {
+ m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+ seqForThisGroup = it->second;
+ if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); }
+ }
+
+ return seqForThisGroup;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "getSeqs");
+ exit(1);
+ }
+}
+/************************************************************/
+int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){
+ try {
+ map<string, vector<Sequence> >::iterator it;
+ vector<Sequence> seqForThisGroup;
+ vector<seqPriorityNode> nameVector;
+
+ it = seqs.find(g);
+ if(it == seqs.end()) {
+ m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+
+ ofstream out;
+ m->openOutputFile(filename, out);
+
+ seqForThisGroup = it->second;
+
+ if (uchimeFormat) {
+ // format should look like
+ //>seqName /ab=numRedundantSeqs/
+ //sequence
+
+ map<string, int> countForThisGroup = getCountTable(g);
+ map<string, int>::iterator itCount;
+ int error = 0;
+
+ for (int i = 0; i < seqForThisGroup.size(); i++) {
+ itCount = countForThisGroup.find(seqForThisGroup[i].getName());
+
+ if (itCount == countForThisGroup.end()){
+ error = 1;
+ m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
+ }else {
+ seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
+ nameVector.push_back(temp);
+ }
+ }
+
+ if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
+
+ //sort by num represented
+ sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+
+ //print new file in order of
+ for (int i = 0; i < nameVector.size(); i++) {
+
+ if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+
+ out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
+ }
+
+ }else {
+ //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
+ for (int i = 0; i < seqForThisGroup.size(); i++) {
+
+ if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+
+ seqForThisGroup[i].printSequence(out);
+ }
+ }
+ out.close();
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "getSeqs");
+ exit(1);
+ }
+}
+
+/************************************************************/
+map<string, int> SequenceCountParser::getCountTable(string g){
+ try {
+ map<string, map<string, int> >::iterator it;
+ map<string, int> countForThisGroup;
+
+ it = countTablePerGroup.find(g);
+ if(it == countTablePerGroup.end()) {
+ m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+ countForThisGroup = it->second;
+ if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences."); }
+ }
+
+ return countForThisGroup;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceCountParser", "getCountTable");
+ exit(1);
+ }
+}
+/************************************************************/
+int SequenceCountParser::getCountTable(string g, string filename){
+ try {
+ map<string, map<string, int> >::iterator it;
+ map<string, int> countForThisGroup;
+
+ it = countTablePerGroup.find(g);
+ if(it == countTablePerGroup.end()) {
+ m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+ countForThisGroup = it->second;
+
+ ofstream out;
+ m->openOutputFile(filename, out);
+ out << "Representative_Sequence\ttotal\t" << g << endl;
+
+ for (map<string, int>::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) {
+
+ if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+
+ out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl;
+ }
+
+ out.close();
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "getCountTable");
+ exit(1);
+ }
+}
+/************************************************************/
+
+
+
--- /dev/null
+#ifndef Mothur_sequencecountparser_h
+#define Mothur_sequencecountparser_h
+
+//
+// sequencecountparser.h
+// Mothur
+//
+// Created by Sarah Westcott on 8/7/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "counttable.h"
+
+/* This class reads a fasta and count file and parses the data by group. The countfile must contain group information.
+
+ Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences.
+ This is because when we parse the count file we make a unique for each group instead of 1 unique for all
+ groups.
+
+ */
+
+class SequenceCountParser {
+
+public:
+
+ SequenceCountParser(string, string); //count, fasta - file mismatches will set m->control_pressed = true
+ SequenceCountParser(string, CountTable&); //fasta, counttable - file mismatches will set m->control_pressed = true
+ ~SequenceCountParser();
+
+ //general operations
+ int getNumGroups();
+ vector<string> getNamesOfGroups();
+
+ int getNumSeqs(string); //returns the number of unique sequences in a specific group
+ vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
+ map<string, int> getCountTable(string); //returns seqName -> numberOfRedundantSeqs for a specific group - the count file format, but each line is parsed by group.
+
+ int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat=false
+ int getCountTable(string, string); //print seqName -> numberRedundantSeqs for a specific group - group, filename
+
+ map<string, string> getAllSeqsMap(){ return allSeqsMap; } //returns map where the key=sequenceName and the value=representativeSequence - helps us remove duplicates after group by group processing
+private:
+
+ CountTable countTable;
+ MothurOut* m;
+
+ int numSeqs;
+ map<string, string> allSeqsMap;
+ map<string, vector<Sequence> > seqs; //a vector for each group
+ map<string, map<string, int> > countTablePerGroup; //countTable for each group
+ vector<string> namesOfGroups;
+};
+
+
+
+#endif
/************************************************************/
bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
/************************************************************/
-string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
-/************************************************************/
int SequenceParser::getNumSeqs(string g){
try {
map<string, vector<Sequence> >::iterator it;
int getNumGroups();
vector<string> getNamesOfGroups();
bool isValidGroup(string); //return true if string is a valid group
- string getGroup(string); //returns group of a specific sequence
int getNumSeqs(string); //returns the number of unique sequences in a specific group
vector<Sequence> getSeqs(string); //returns unique sequences in a specific group