A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; };
A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; };
A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; };
+ A7BF221414587886000AD524 /* myPerseus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF221214587886000AD524 /* myPerseus.cpp */; };
+ A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; };
A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; };
A7A61F2B130062E000E05B6B /* amovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = amovacommand.h; sourceTree = "<group>"; };
A7A61F2C130062E000E05B6B /* amovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = amovacommand.cpp; sourceTree = "<group>"; };
A7AACFBA132FE008003D6C4D /* currentfile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = currentfile.h; sourceTree = "<group>"; };
+ A7BF221214587886000AD524 /* myPerseus.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myPerseus.cpp; sourceTree = "<group>"; };
+ A7BF221314587886000AD524 /* myPerseus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myPerseus.h; sourceTree = "<group>"; };
+ A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraperseuscommand.h; sourceTree = "<group>"; };
+ A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraperseuscommand.cpp; sourceTree = "<group>"; };
A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = "<group>"; };
A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = "<group>"; };
A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */,
A7E9B68312D37EC400DA6239 /* chimerapintailcommand.h */,
A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */,
+ A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */,
+ A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */,
A7E9B68B12D37EC400DA6239 /* chimeraslayercommand.h */,
A7E9B68A12D37EC400DA6239 /* chimeraslayercommand.cpp */,
A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */,
A7E9B68912D37EC400DA6239 /* chimeraslayer.h */,
A7E9B74612D37EC400DA6239 /* maligner.h */,
A7E9B74512D37EC400DA6239 /* maligner.cpp */,
+ A7BF221314587886000AD524 /* myPerseus.h */,
+ A7BF221214587886000AD524 /* myPerseus.cpp */,
A7E9B79312D37EC400DA6239 /* pintail.cpp */,
A7E9B79412D37EC400DA6239 /* pintail.h */,
A7E9B82E12D37EC400DA6239 /* slayer.cpp */,
A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */,
A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */,
A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */,
+ A7BF221414587886000AD524 /* myPerseus.cpp in Sources */,
+ A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+/*
+ * chimeraperseuscommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 10/26/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "chimeraperseuscommand.h"
+#include "deconvolutecommand.h"
+#include "sequence.hpp"
+//**********************************************************************************************************************
+vector<string> ChimeraPerseusCommand::setParameters(){
+ try {
+ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
+ CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
+ CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pcutoff);
+ CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "",false,false); parameters.push_back(palpha);
+ CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "",false,false); parameters.push_back(pbeta);
+
+ vector<string> myArray;
+ for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
+ return myArray;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "setParameters");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+string ChimeraPerseusCommand::getHelpString(){
+ try {
+ string helpString = "";
+ helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
+ helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
+ helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
+ helpString += "The name parameter allows you to provide a name file associated with your fasta file. If none is given unique.seqs will be run to generate one. \n";
+ helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
+ helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+ helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
+ helpString += "The alpha parameter .... The default is -5.54. \n";
+ helpString += "The beta parameter .... The default is 0.33. \n";
+ helpString += "The cutoff parameter .... The default is 0.50. \n";
+ helpString += "The chimera.perseus command should be in the following format: \n";
+ helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n";
+ helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n";
+ helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
+ return helpString;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "getHelpString");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+ChimeraPerseusCommand::ChimeraPerseusCommand(){
+ try {
+ abort = true; calledHelp = true;
+ setParameters();
+ vector<string> tempOutNames;
+ outputTypes["chimera"] = tempOutNames;
+ outputTypes["accnos"] = tempOutNames;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+ChimeraPerseusCommand::ChimeraPerseusCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ vector<string> myArray = setParameters();
+
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+
+ ValidParameters validParameter("chimera.uchime");
+ map<string,string>::iterator it;
+
+ //check to make sure all parameters are valid for command
+ for (it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ vector<string> tempOutNames;
+ outputTypes["chimera"] = tempOutNames;
+ outputTypes["accnos"] = tempOutNames;
+
+ //if the user changes the input directory command factory will send this info to us in the output parameter
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+
+ //check for required parameters
+ fastafile = validParameter.validFile(parameters, "fasta", false);
+ if (fastafile == "not found") {
+ //if there is a current fasta file, use it
+ string filename = m->getFastaFile();
+ if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }else {
+ m->splitAtDash(fastafile, fastaFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < fastaFileNames.size(); i++) {
+
+ bool ignore = false;
+ if (fastaFileNames[i] == "current") {
+ fastaFileNames[i] = m->getFastaFile();
+ if (fastaFileNames[i] != "") { m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true;
+ //erase from file list
+ fastaFileNames.erase(fastaFileNames.begin()+i);
+ i--;
+ }
+ }
+
+ if (!ignore) {
+
+ if (inputDir != "") {
+ string path = m->hasPath(fastaFileNames[i]);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
+ }
+
+ int ableToOpen;
+ ifstream in;
+
+ ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
+ m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fastaFileNames[i] = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
+ m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fastaFileNames[i] = tryPath;
+ }
+ }
+
+ in.close();
+
+ if (ableToOpen == 1) {
+ m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+ //erase from file list
+ fastaFileNames.erase(fastaFileNames.begin()+i);
+ i--;
+ }else {
+ m->setFastaFile(fastaFileNames[i]);
+ }
+ }
+ }
+
+ //make sure there is at least one valid file left
+ if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+ }
+
+
+ //check for required parameters
+ bool hasName = true;
+ namefile = validParameter.validFile(parameters, "name", false);
+ if (namefile == "not found") {
+ //if there is a current fasta file, use it
+ string filename = m->getNameFile();
+ if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
+ hasName = false;
+ }else {
+ m->splitAtDash(namefile, nameFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < nameFileNames.size(); i++) {
+
+ bool ignore = false;
+ if (nameFileNames[i] == "current") {
+ nameFileNames[i] = m->getNameFile();
+ if (nameFileNames[i] != "") { m->mothurOut("Using " + nameFileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true;
+ //erase from file list
+ nameFileNames.erase(nameFileNames.begin()+i);
+ i--;
+ }
+ }
+
+ if (!ignore) {
+
+ if (inputDir != "") {
+ string path = m->hasPath(nameFileNames[i]);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; }
+ }
+
+ int ableToOpen;
+ ifstream in;
+
+ ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror");
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]);
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ nameFileNames[i] = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]);
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ nameFileNames[i] = tryPath;
+ }
+ }
+
+ in.close();
+
+ if (ableToOpen == 1) {
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+ //erase from file list
+ nameFileNames.erase(nameFileNames.begin()+i);
+ i--;
+ }else {
+ m->setNameFile(nameFileNames[i]);
+ }
+ }
+ }
+
+ //make sure there is at least one valid file left
+ if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+ }
+
+ if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+
+ bool hasGroup = true;
+ groupfile = validParameter.validFile(parameters, "group", false);
+ if (groupfile == "not found") { groupfile = ""; hasGroup = false; }
+ else {
+ m->splitAtDash(groupfile, groupFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < groupFileNames.size(); i++) {
+
+ bool ignore = false;
+ if (groupFileNames[i] == "current") {
+ groupFileNames[i] = m->getGroupFile();
+ if (groupFileNames[i] != "") { m->mothurOut("Using " + groupFileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true;
+ //erase from file list
+ groupFileNames.erase(groupFileNames.begin()+i);
+ i--;
+ }
+ }
+
+ if (!ignore) {
+
+ if (inputDir != "") {
+ string path = m->hasPath(groupFileNames[i]);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { groupFileNames[i] = inputDir + groupFileNames[i]; }
+ }
+
+ int ableToOpen;
+ ifstream in;
+
+ ableToOpen = m->openInputFile(groupFileNames[i], in, "noerror");
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(groupFileNames[i]);
+ m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ groupFileNames[i] = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(groupFileNames[i]);
+ m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ groupFileNames[i] = tryPath;
+ }
+ }
+
+ in.close();
+
+ if (ableToOpen == 1) {
+ m->mothurOut("Unable to open " + groupFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+ //erase from file list
+ groupFileNames.erase(groupFileNames.begin()+i);
+ i--;
+ }else {
+ m->setGroupFile(groupFileNames[i]);
+ }
+ }
+ }
+
+ //make sure there is at least one valid file left
+ if (groupFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid group files."); m->mothurOutEndLine(); abort = true; }
+ }
+
+ if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+
+
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+ string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
+ m->setProcessors(temp);
+ convert(temp, processors);
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.50"; }
+ convert(temp, cutoff);
+
+ temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found"){ temp = "-5.54"; }
+ convert(temp, alpha);
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.33"; }
+ convert(temp, beta);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int ChimeraPerseusCommand::execute(){
+ try{
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+
+ //process each file
+ for (int s = 0; s < fastaFileNames.size(); s++) {
+
+ m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
+
+ int start = time(NULL);
+ if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
+ string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.chimera";
+ string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.accnos";
+ //string newFasta = m->getRootName(fastaFileNames[s]) + "temp";
+
+ //you provided a groupfile
+ string groupFile = "";
+ if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; }
+
+ string nameFile = "";
+ if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
+ nameFile = nameFileNames[s];
+ }else { nameFile = getNamesFile(fastaFileNames[s]); }
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ int numSeqs = 0;
+ int numChimeras = 0;
+
+ if (groupFile != "") {
+ //Parse sequences by group
+ SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
+ vector<string> groups = parser.getNamesOfGroups();
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ //clears files
+ ofstream out, out1, out2;
+ m->openOutputFile(outputFileName, out); out.close();
+ m->openOutputFile(accnosFileName, out1); out1.close();
+
+ if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); }
+ else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
+
+ m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ }else{
+ if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+
+ //read sequences and store sorted by frequency
+ vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
+ }
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine();
+ outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName);
+ outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
+ }
+
+ //set accnos file as new current accnosfile
+ string current = "";
+ itTypes = outputTypes.find("accnos");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
+ }
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+string ChimeraPerseusCommand::getNamesFile(string& inputFile){
+ try {
+ string nameFile = "";
+
+ m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
+
+ //use unique.seqs to create new name and fastafile
+ string inputString = "fasta=" + inputFile;
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* uniqueCommand = new DeconvoluteCommand(inputString);
+ uniqueCommand->execute();
+
+ map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+
+ delete uniqueCommand;
+
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ nameFile = filenames["name"][0];
+ inputFile = filenames["fasta"][0];
+
+ return nameFile;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
+ try {
+
+ int totalSeqs = 0;
+ int numChimeras = 0;
+
+ for (int i = start; i < end; i++) {
+
+ m->mothurOutEndLine(); m->mothurOut("Checking sequences from group " + groups[i] + "..."); m->mothurOutEndLine();
+
+ int start = time(NULL); if (m->control_pressed) { return 0; }
+
+ vector<seqData> sequences = loadSequences(parser, groups[i]);
+
+ if (m->control_pressed) { return 0; }
+
+ int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras);
+ totalSeqs += numSeqs;
+
+ if (m->control_pressed) { return 0; }
+
+ //append files
+ m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i]));
+ m->appendFiles((accnos+groups[i]), accnos); m->mothurRemove((accnos+groups[i]));
+
+ m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine();
+ }
+
+ return totalSeqs;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "driverGroups");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
+ try {
+
+ vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
+ map<string, string> nameMap = parser.getNameMap(group);
+ map<string, string>::iterator it;
+
+ vector<seqData> sequences;
+ bool error = false;
+
+ for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+
+ if (m->control_pressed) { return sequences; }
+
+ it = nameMap.find(thisGroupsSeqs[i].getName());
+ if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
+ else {
+ int num = m->getNumNames(it->second);
+ sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+ }
+ }
+
+ if (error) { m->control_pressed = true; }
+
+ //sort by frequency
+ sort(sequences.rbegin(), sequences.rend());
+
+ return sequences;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "loadSequences");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
+ try {
+ map<string, int>::iterator it;
+ map<string, int> nameMap = m->readNames(name);
+
+ //read fasta file and create sequenceData structure - checking for file mismatches
+ vector<seqData> sequences;
+ bool error = false;
+ ifstream in;
+ m->openInputFile(inputFile, in);
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); return sequences; }
+
+ Sequence temp(in); m->gobble(in);
+
+ it = nameMap.find(temp.getName());
+ if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
+ else {
+ sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
+ }
+ }
+ in.close();
+
+ if (error) { m->control_pressed = true; }
+
+ //sort by frequency
+ sort(sequences.rbegin(), sequences.rend());
+
+ return sequences;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& sequences, string accnosFileName, int& numChimeras){
+ try {
+
+ vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
+ for(int i=0;i<4;i++){ correctModel[i].resize(4); }
+
+ correctModel[0][0] = 0.000000; //AA
+ correctModel[1][0] = 11.619259; //CA
+ correctModel[2][0] = 11.694004; //TA
+ correctModel[3][0] = 7.748623; //GA
+
+ correctModel[1][1] = 0.000000; //CC
+ correctModel[2][1] = 7.619657; //TC
+ correctModel[3][1] = 12.852562; //GC
+
+ correctModel[2][2] = 0.000000; //TT
+ correctModel[3][2] = 10.964048; //TG
+
+ correctModel[3][3] = 0.000000; //GG
+
+ for(int i=0;i<4;i++){
+ for(int j=0;j<i;j++){
+ correctModel[j][i] = correctModel[i][j];
+ }
+ }
+
+ int numSeqs = sequences.size();
+ int alignLength = sequences[0].sequence.size();
+
+ ofstream chimeraFile;
+ ofstream accnosFile;
+ m->openOutputFile(chimeraFileName, chimeraFile);
+ m->openOutputFile(accnosFileName, accnosFile);
+
+ Perseus myPerseus;
+ vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
+
+ chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
+
+ vector<bool> chimeras(numSeqs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ cout << sequences[i].seqName << endl << sequences[i].sequence << endl << sequences[i].frequency << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ vector<bool> restricted = chimeras;
+
+ vector<vector<int> > leftDiffs(numSeqs);
+ vector<vector<int> > leftMaps(numSeqs);
+ vector<vector<int> > rightDiffs(numSeqs);
+ vector<vector<int> > rightMaps(numSeqs);
+
+ vector<int> singleLeft, bestLeft;
+ vector<int> singleRight, bestRight;
+
+ int bestSingleIndex, bestSingleDiff;
+ vector<pwAlign> alignments(numSeqs);
+
+ int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
+ cout << "comparisons = " << comparisons << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
+
+ string dummyA, dummyB;
+
+ if(comparisons >= 2){
+ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
+ cout << "minMismatchToChimera = " << minMismatchToChimera << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ int minMismatchToTrimera = numeric_limits<int>::max();
+ int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
+
+ if(minMismatchToChimera >= 3 && comparisons >= 3){
+ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
+ cout << "minMismatchToTrimera = " << minMismatchToTrimera << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+ }
+
+ double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
+ cout << "singleDist = " << singleDist << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ string type;
+ string chimeraRefSeq;
+
+ if(minMismatchToChimera - minMismatchToTrimera >= 3){
+ type = "trimera";
+ chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
+ }
+ else{
+ type = "chimera";
+ chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
+ }
+ cout << "chimeraRefSeq = " << chimeraRefSeq << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
+ cout << "chimeraDist = " << chimeraDist << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
+ double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
+ cout << "loonIndex = " << loonIndex << endl;
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
+ chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
+ chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
+ chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
+
+ double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, alpha, beta);
+
+ chimeraFile << probability << '\t';
+
+ if(probability > cutoff){
+ chimeraFile << type << endl;
+ accnosFile << sequences[i].seqName << endl;
+ chimeras[i] = 1;
+ numChimeras++;
+ }
+ else{
+ chimeraFile << "good" << endl;
+ }
+
+ }
+ else{
+ chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
+ }
+
+ //report progress
+ if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1) + "\n"); }
+ }
+
+ if((numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); }
+
+ chimeraFile.close();
+ accnosFile.close();
+
+ return numSeqs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "driver");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
+ try {
+
+ vector<int> processIDS;
+ int process = 1;
+ int num = 0;
+
+ //sanity check
+ if (groups.size() < processors) { processors = groups.size(); }
+
+ //divide the groups between the processors
+ vector<linePair> lines;
+ int numGroupsPerProcessor = groups.size() / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numGroupsPerProcessor;
+ int endIndex = (i+1) * numGroupsPerProcessor;
+ if(i == (processors - 1)){ endIndex = groups.size(); }
+ lines.push_back(linePair(startIndex, endIndex));
+ }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
+
+ //pass numSeqs to parent
+ ofstream out;
+ string tempFile = outputFName + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+ out << num << endl;
+ out.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //do my part
+ num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ ifstream in;
+ string tempFile = outputFName + toString(processIDS[i]) + ".num.temp";
+ m->openInputFile(tempFile, in);
+ if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
+ in.close(); m->mothurRemove(tempFile);
+ }
+
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<perseusData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ string extension = toString(i) + ".temp";
+
+ perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
+
+ pDataArray.push_back(tempPerseus);
+ processIDS.push_back(i);
+
+ //MyPerseusThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyPerseusThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+
+ //using the main process as a worker saves time and memory
+ num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+ cout << "here" << endl;
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+#endif
+
+
+ //append output files
+ for(int i=0;i<processIDS.size();i++){
+ m->appendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName);
+ m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos);
+ m->mothurRemove((accnos + toString(processIDS[i]) + ".temp"));
+ }
+
+ return num;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
+ try {
+ map<string, string> uniqueNames = parser.getAllSeqsMap();
+ map<string, string>::iterator itUnique;
+ int total = 0;
+
+ //edit accnos file
+ ifstream in2;
+ m->openInputFile(accnosFileName, in2);
+
+ ofstream out2;
+ m->openOutputFile(accnosFileName+".temp", out2);
+
+ string name;
+ set<string> namesInFile; //this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once
+ set<string>::iterator itNames;
+ set<string> chimerasInFile;
+ set<string>::iterator itChimeras;
+
+
+ while (!in2.eof()) {
+ if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; }
+
+ in2 >> name; m->gobble(in2);
+
+ //find unique name
+ itUnique = uniqueNames.find(name);
+
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else {
+ itChimeras = chimerasInFile.find((itUnique->second));
+
+ if (itChimeras == chimerasInFile.end()) {
+ out2 << itUnique->second << endl;
+ chimerasInFile.insert((itUnique->second));
+ total++;
+ }
+ }
+ }
+ in2.close();
+ out2.close();
+
+ m->mothurRemove(accnosFileName);
+ rename((accnosFileName+".temp").c_str(), accnosFileName.c_str());
+
+ //edit chimera file
+ ifstream in;
+ m->openInputFile(outputFileName, in);
+
+ ofstream out;
+ m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+
+ int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent;
+ float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ string index, BestMatchName, parent1, parent2, flag;
+ name = "";
+ namesInFile.clear();
+ //assumptions - in file each read will always look like
+ /*
+ SequenceIndex Name DiffsToBestMatch BestMatchIndex BestMatchName DiffstToChimera IndexofLeftParent IndexOfRightParent NameOfLeftParent NameOfRightParent DistanceToBestMatch cIndex (cIndex - singleDist) loonIndex MismatchesToChimera MismatchToTrimera ChimeraBreakPoint LogisticProbability TypeOfSequence
+ 0 F01QG4L02JVBQY 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
+ 1 F01QG4L02ICTC6 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
+ 2 F01QG4L02JZOEC 48 0 F01QG4L02JVBQY 47 0 0 F01QG4L02JVBQY F01QG4L02JVBQY 2.0449 2.03545 -0.00944493 0 47 2147483647 138 0 good
+ 3 F01QG4L02G7JEC 42 0 F01QG4L02JVBQY 40 1 0 F01QG4L02ICTC6 F01QG4L02JVBQY 1.87477 1.81113 -0.0636404 5.80145 40 2147483647 25 0 good
+ */
+
+ //get and print headers
+ BestMatchName = m->getline(in); m->gobble(in);
+ out << BestMatchName << endl;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove((outputFileName+".temp")); return 0; }
+
+ bool print = false;
+ in >> index; m->gobble(in);
+
+ if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given
+ in >> name; m->gobble(in);
+ in >> DiffsToBestMatch; m->gobble(in);
+ in >> BestMatchIndex; m->gobble(in);
+ in >> BestMatchName; m->gobble(in);
+ in >> DiffstToChimera; m->gobble(in);
+ in >> IndexofLeftParent; m->gobble(in);
+ in >> IndexOfRightParent; m->gobble(in);
+ in >> parent1; m->gobble(in);
+ in >> parent2; m->gobble(in);
+ in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; m->gobble(in);
+
+ //find unique name
+ itUnique = uniqueNames.find(name);
+
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else {
+ name = itUnique->second;
+ //is this name already in the file
+ itNames = namesInFile.find((name));
+
+ if (itNames == namesInFile.end()) { //no not in file
+ if (flag == "good") { //are you really a no??
+ //is this sequence really not chimeric??
+ itChimeras = chimerasInFile.find(name);
+
+ //then you really are a no so print, otherwise skip
+ if (itChimeras == chimerasInFile.end()) { print = true; }
+ }else{ print = true; }
+ }
+ }
+
+ if (print) {
+ out << index << '\t' << name << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t';
+ namesInFile.insert(name);
+
+ if (BestMatchName != "Null") {
+ itUnique = uniqueNames.find(BestMatchName);
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find BestMatchName "+ BestMatchName + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else { out << itUnique->second << '\t'; }
+ }else { out << "Null" << '\t'; }
+
+ out << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t';
+
+ if (parent1 != "Null") {
+ itUnique = uniqueNames.find(parent1);
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent1 "+ parent1 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else { out << itUnique->second << '\t'; }
+ }else { out << "Null" << '\t'; }
+
+ if (parent1 != "Null") {
+ itUnique = uniqueNames.find(parent2);
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent2 "+ parent2 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else { out << itUnique->second << '\t'; }
+ }else { out << "Null" << '\t'; }
+
+ out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl;
+ }
+ }else { index = m->getline(in); m->gobble(in); }
+ }
+ in.close();
+ out.close();
+
+ m->mothurRemove(outputFileName);
+ rename((outputFileName+".temp").c_str(), outputFileName.c_str());
+
+ return total;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+
--- /dev/null
+#ifndef CHIMERAPERSEUSCOMMAND_H
+#define CHIMERAPERSEUSCOMMAND_H
+
+
+/*
+ * chimeraperseuscommand.h
+ * Mothur
+ *
+ * Created by westcott on 10/26/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+
+#include "mothur.h"
+#include "command.hpp"
+#include "sequenceparser.h"
+#include "myPerseus.h"
+
+/***********************************************************/
+class ChimeraPerseusCommand : public Command {
+public:
+ ChimeraPerseusCommand(string);
+ ChimeraPerseusCommand();
+ ~ChimeraPerseusCommand() {}
+
+ vector<string> setParameters();
+ string getCommandName() { return "chimera.perseus"; }
+ string getCommandCategory() { return "Sequence Processing"; }
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Chimera.perseus\n"; }
+ string getDescription() { return "detect chimeric sequences"; }
+
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+private:
+ struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ };
+
+ bool abort;
+ string fastafile, groupfile, outputDir, namefile;
+ int processors;
+ double cutoff, alpha, beta;
+
+ vector<string> outputNames;
+ vector<string> fastaFileNames;
+ vector<string> nameFileNames;
+ vector<string> groupFileNames;
+
+ string getNamesFile(string&);
+ int driver(string, vector<seqData>&, string, int&);
+ vector<seqData> readFiles(string, string);
+ vector<seqData> loadSequences(SequenceParser&, string);
+ int deconvoluteResults(SequenceParser&, string, string);
+ int driverGroups(SequenceParser&, string, string, int, int, vector<string>);
+ int createProcessesGroups(SequenceParser&, string, string, vector<string>, string, string, string);
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct perseusData {
+ string fastafile;
+ string namefile;
+ string groupfile;
+ string outputFName;
+ string accnos;
+ MothurOut* m;
+ int start;
+ int end;
+ int threadID, count, numChimeras;
+ double alpha, beta, cutoff;
+ vector<string> groups;
+
+ perseusData(){}
+ perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
+ alpha = a;
+ beta = b;
+ cutoff = c;
+ fastafile = f;
+ namefile = n;
+ groupfile = g;
+ outputFName = o;
+ accnos = ac;
+ m = mout;
+ start = st;
+ end = en;
+ threadID = tid;
+ groups = gr;
+ count = 0;
+ numChimeras = 0;
+ }
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
+ perseusData* pDataArray;
+ pDataArray = (perseusData*)lpParam;
+
+ try {
+
+ //clears files
+ ofstream out, out1, out2;
+ pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close();
+ pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
+
+ //parse fasta and name file by group
+ SequenceParser* parser;
+ if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+ else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
+
+ int totalSeqs = 0;
+ int numChimeras = 0;
+
+ for (int i = pDataArray->start; i < pDataArray->end; i++) {
+
+ int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+
+ pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine();
+
+ //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
+ ////////////////////////////////////////////////////////////////////////////////////////
+ vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
+ map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
+ map<string, string>::iterator it;
+
+ vector<seqData> sequences;
+ bool error = false;
+
+ for (int j = 0; j < thisGroupsSeqs.size(); j++) {
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+
+ it = nameMap.find(thisGroupsSeqs[j].getName());
+ if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ else {
+ int num = pDataArray->m->getNumNames(it->second);
+ sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num));
+ }
+ }
+
+ if (error) { pDataArray->m->control_pressed = true; }
+
+ //sort by frequency
+ sort(sequences.rbegin(), sequences.rend());
+ ////////////////////////////////////////////////////////////////////////////////////////
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+
+ //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
+ ////////////////////////////////////////////////////////////////////////////////////////
+ string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i];
+ string accnosFileName = pDataArray->accnos+pDataArray->groups[i];
+
+ vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
+ for(int j=0;j<4;j++){ correctModel[j].resize(4); }
+
+ correctModel[0][0] = 0.000000; //AA
+ correctModel[1][0] = 11.619259; //CA
+ correctModel[2][0] = 11.694004; //TA
+ correctModel[3][0] = 7.748623; //GA
+
+ correctModel[1][1] = 0.000000; //CC
+ correctModel[2][1] = 7.619657; //TC
+ correctModel[3][1] = 12.852562; //GC
+
+ correctModel[2][2] = 0.000000; //TT
+ correctModel[3][2] = 10.964048; //TG
+
+ correctModel[3][3] = 0.000000; //GG
+
+ for(int k=0;k<4;k++){
+ for(int j=0;j<k;j++){
+ correctModel[j][k] = correctModel[k][j];
+ }
+ }
+
+ int numSeqs = sequences.size();
+ int alignLength = sequences[0].sequence.size();
+
+ ofstream chimeraFile;
+ ofstream accnosFile;
+ pDataArray->m->openOutputFile(chimeraFileName, chimeraFile);
+ pDataArray->m->openOutputFile(accnosFileName, accnosFile);
+
+ Perseus myPerseus;
+ vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
+
+ chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
+
+ vector<bool> chimeras(numSeqs, 0);
+
+ for(int j=0;j<numSeqs;j++){
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ vector<bool> restricted = chimeras;
+
+ vector<vector<int> > leftDiffs(numSeqs);
+ vector<vector<int> > leftMaps(numSeqs);
+ vector<vector<int> > rightDiffs(numSeqs);
+ vector<vector<int> > rightMaps(numSeqs);
+
+ vector<int> singleLeft, bestLeft;
+ vector<int> singleRight, bestRight;
+
+ int bestSingleIndex, bestSingleDiff;
+ vector<pwAlign> alignments(numSeqs);
+
+ int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
+
+ string dummyA, dummyB;
+
+ if(comparisons >= 2){
+ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ int minMismatchToTrimera = numeric_limits<int>::max();
+ int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
+
+ if(minMismatchToChimera >= 3 && comparisons >= 3){
+ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+ }
+
+ double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ string type;
+ string chimeraRefSeq;
+
+ if(minMismatchToChimera - minMismatchToTrimera >= 3){
+ type = "trimera";
+ chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
+ }
+ else{
+ type = "chimera";
+ chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
+ }
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
+ double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+
+ chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
+ chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
+ chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
+ chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
+
+ double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
+
+ chimeraFile << probability << '\t';
+
+ if(probability > pDataArray->cutoff){
+ chimeraFile << type << endl;
+ accnosFile << sequences[j].seqName << endl;
+ chimeras[j] = 1;
+ numChimeras++;
+ }
+ else{
+ chimeraFile << "good" << endl;
+ }
+
+ }
+ else{
+ chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
+ }
+ //report progress
+ if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); }
+ }
+
+ if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); }
+
+ chimeraFile.close();
+ accnosFile.close();
+ ////////////////////////////////////////////////////////////////////////////////////////
+
+ totalSeqs += numSeqs;
+
+ //append files
+ pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
+ pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
+ pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine();
+
+ if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+ }
+
+ pDataArray->count = totalSeqs;
+ delete parser;
+ return totalSeqs;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+#endif
+
+#endif
+
+
#else
//////////////////////////////////////////////////////////////////////////////////////////////////////
- //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
+ //Windows version shared memory, so be careful when passing variables through the uchimeData struct.
//Above fork() will clone, so memory is separate, but that's not the case with windows,
//////////////////////////////////////////////////////////////////////////////////////////////////////
pDataArray.push_back(tempUchime);
processIDS.push_back(i);
- //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //MyUchimeThreadFunction is in header. It must be global or static to work with the threads.
//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
hThreadArray[i-1] = CreateThread(NULL, 0, MyUchimeThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
}
m->gobble(in);
//are there confidence scores, if so remove them
- if (tax.find_first_of('(') != -1) { removeConfidences(tax); }
+ if (tax.find_first_of('(') != -1) { m->removeConfidences(tax); }
taxMap[name] = tax;
out << (i+1) << '\t' << size << '\t' << conTax << endl;
string noConfidenceConTax = conTax;
- removeConfidences(noConfidenceConTax);
+ m->removeConfidences(noConfidenceConTax);
//add this bins taxonomy to summary
if (basis == "sequence") {
exit(1);
}
}
-
-/**************************************************************************************************/
-void ClassifyOtuCommand::removeConfidences(string& tax) {
- try {
-
- string taxon;
- string newTax = "";
-
- while (tax.find_first_of(';') != -1) {
- //get taxon
- taxon = tax.substr(0,tax.find_first_of(';'));
-
- int pos = taxon.find_first_of('(');
- if (pos != -1) {
- taxon = taxon.substr(0, pos); //rip off confidence
- }
-
- taxon += ";";
-
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());
- newTax += taxon;
- }
-
- tax = newTax;
- }
- catch(exception& e) {
- m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
- exit(1);
- }
-}
//**********************************************************************************************************************
int readNamesFile();
int readTaxonomyFile();
- void removeConfidences(string&);
int process(ListVector*);
string addUnclassifieds(string, int);
vector<string> findConsensusTaxonomy(int, ListVector*, int&, string&); // returns the name of the "representative" taxonomy of given bin
#include "countgroupscommand.h"
#include "clearmemorycommand.h"
#include "summarytaxcommand.h"
+#include "chimeraperseuscommand.h"
/*******************************************************/
commands["chimera.check"] = "MPIEnabled";
commands["chimera.slayer"] = "MPIEnabled";
commands["chimera.uchime"] = "chimera.uchime";
+ commands["chimera.perseus"] = "chimera.perseus";
commands["chimera.pintail"] = "MPIEnabled";
commands["chimera.bellerophon"] = "MPIEnabled";
commands["screen.seqs"] = "MPIEnabled";
commands["sens.spec"] = "sens.spec";
commands["seq.error"] = "seq.error";
commands["seq.error"] = "summary.tax";
+
commands["quit"] = "MPIEnabled";
}
else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); }
else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); }
else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); }
+ else if(commandName == "chimera.perseus") { command = new ChimeraPerseusCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); }
else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); }
else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); }
+ else if(commandName == "chimera.perseus") { pipecommand = new ChimeraPerseusCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); }
else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); }
else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); }
+ else if(commandName == "chimera.perseus") { shellcommand = new ChimeraPerseusCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
if (hasConPos != string::npos) {
taxonsHasConfidence[i] = true;
searchTaxons[i] = getTaxons(listOfTaxons[i]);
- noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]);
+ noConfidenceTaxons[i] = listOfTaxons[i];
+ m->removeConfidences(noConfidenceTaxons[i]);
}
}
if (!taxonsHasConfidence[j]) {
int hasConfidences = tax.find_first_of('(');
if (hasConfidences != string::npos) {
- newtax = removeConfidences(tax);
+ newtax = tax;
+ m->removeConfidences(newtax);
}
int pos = newtax.find(noConfidenceTaxons[j]);
string noNewTax = tax;
int hasConfidences = tax.find_first_of('(');
if (hasConfidences != string::npos) {
- noNewTax = removeConfidences(tax);
+ noNewTax = tax;
+ m->removeConfidences(noNewTax);
}
int pos = noNewTax.find(noConfidenceTaxons[j]);
exit(1);
}
}
-/**************************************************************************************************/
-string GetLineageCommand::removeConfidences(string tax) {
- try {
-
- string taxon = "";
- int taxLength = tax.length();
- for(int i=0;i<taxLength;i++){
- if(tax[i] == ';'){
- taxon = taxon.substr(0, taxon.find_first_of('(')); //rip off confidence
- taxon += ";";
- }
- else{
- taxon += tax[i];
- }
- }
-
- return taxon;
- }
- catch(exception& e) {
- m->errorOut(e, "GetLineageCommand", "removeConfidences");
- exit(1);
- }
-}
//**********************************************************************************************************************
//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
int GetLineageCommand::readAlign(){
int readAlign();
int readList();
int readTax();
- string removeConfidences(string);
vector< map<string, float> > getTaxons(string);
};
//get set names
string setA = namesOfGroupCombos[c][0];
string setB = namesOfGroupCombos[c][1];
-
+ cout << setA << '\t' << setB << endl;
//get filename
string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName);
}
}
- //for (int i = 0; i < subset.size(); i++) { cout << subset[i]->getGroup() << endl; }
+ for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
+ cout << setACount << endl;
if ((setACount == 0) || (setBCount == 0)) {
m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine();
}
//total for first grouping
- for (int i = 0; i < (secondGroupingStart-1); i++) { total1 += total[i]; }
+ for (int i = 0; i < secondGroupingStart; i++) { total1 += total[i]; }
//total for second grouping
- for (int i = (secondGroupingStart-1); i < column; i++) { total2 += total[i]; }
+ for (int i = secondGroupingStart; i < column; i++) { total2 += total[i]; }
//Creates the ratios by first finding the minimum of totals
double min = total[0];
if (m->control_pressed) { return 1; }
// Start the calculations.
- if ( (column == 2) || ((secondGroupingStart-1) < 8) || ((column-secondGroupingStart+1) < 8) ){
+ if ( (column == 2) || (secondGroupingStart < 8) || ((column-secondGroupingStart) < 8) ){
vector<double> fish; fish.resize(row, 0.0);
vector<double> fish2; fish2.resize(row, 0.0);
for(int i = 0; i < row; i++){
- for(int j = 0; j < (secondGroupingStart-1); j++) { fish[i] += data[i][j]; }
- for(int j = (secondGroupingStart-1); j < column; j++) { fish2[i] += data[i][j]; }
+ for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
+ for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; }
//vector<double> tempData; tempData.resize(4, 0.0);
double f11, f12, f21, f22;
for(int i = 0; i < row; i++){
- for(int j = 0; j < (secondGroupingStart-1); j++){ sparse[i] += data[i][j]; }
- if(sparse[i] < (double)(secondGroupingStart-1)){ c++; }
+ for(int j = 0; j < secondGroupingStart; j++) { sparse[i] += data[i][j]; }
+ if(sparse[i] < (double)secondGroupingStart) { c++; }
// ?<= for col
- for(int j = (secondGroupingStart-1); j < column; j++){ sparse2[i] += data[i][j]; }
- if( (sparse2[i] < (double)(column-secondGroupingStart+1))) { c++; }
+ for(int j = secondGroupingStart; j < column; j++) { sparse2[i] += data[i][j]; }
+ if( (sparse2[i] < (double)(column-secondGroupingStart))) { c++; }
if (c == 2) {
c=0;
for (int j = 0; j < row; j++){
if (m->control_pressed) { return 1; }
- for (int i = 1; i <= (secondGroupingStart-1); i++){ temp[j][0] += data[j][i-1]; }
- temp[j][0] /= (double)(secondGroupingStart-1);
+ for (int i = 0; i < secondGroupingStart; i++){ temp[j][0] += data[j][i]; }
+ temp[j][0] /= (double)secondGroupingStart;
- for(int i = secondGroupingStart; i <= column; i++){ temp[j][1] += data[j][i-1]; }
- temp[j][1] /= (double)(column-secondGroupingStart+1);
+ for(int i = secondGroupingStart; i < column; i++){ temp[j][1] += data[j][i]; }
+ temp[j][1] /= (double)(column-secondGroupingStart);
}
for(int i = 0; i < row; i++){
storage[i][0]=C1[i][0];
C1[i][1]=tool[i+row+row]; // var group 1
storage[i][1]=C1[i][1];
- C1[i][2]=C1[i][1]/(secondGroupingStart-1);
+ C1[i][2]=C1[i][1]/(secondGroupingStart);
storage[i][2]=sqrt(C1[i][2]);
C2[i][0]=tool[i+row]; // mean group 2
storage[i][4]=C2[i][0];
C2[i][1]=tool[i+row+row+row]; // var group 2
storage[i][5]=C2[i][1];
- C2[i][2]=C2[i][1]/(column-secondGroupingStart+1);
+ C2[i][2]=C2[i][1]/(column-secondGroupingStart);
storage[i][6]=sqrt(C2[i][2]);
}
vector<double> var; var.resize(row, 0.0);
vector<double> var2; var2.resize(row, 0.0);
- double a = secondGroupingStart-1;
+ double a = secondGroupingStart;
double b = column - a;
int m = a * row;
int n = row * column;
if (m->control_pressed) { return 0; }
C1[i][0]=tool[i];
C1[i][1]=tool[i+row+row];
- C1[i][2]=C1[i][1]/(secondGroupingStart-1);
+ C1[i][2]=C1[i][1]/(secondGroupingStart);
C2[i][0]=tool[i+row];
C2[i][1]=tool[i+row+row+row]; // var group 2
- C2[i][2]=C2[i][1]/(column-secondGroupingStart+1);
+ C2[i][2]=C2[i][1]/(column-secondGroupingStart);
}
for (int i = 0; i < row; i++){
}
}
/**************************************************************************************************/
+int MothurOut::removeConfidences(string& tax) {
+ try {
+
+ string taxon;
+ string newTax = "";
+
+ while (tax.find_first_of(';') != -1) {
+
+ if (control_pressed) { return 0; }
+
+ //get taxon
+ taxon = tax.substr(0,tax.find_first_of(';'));
+
+ int pos = taxon.find_last_of('(');
+ if (pos != -1) {
+ //is it a number?
+ int pos2 = taxon.find_last_of(')');
+ if (pos2 != -1) {
+ string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
+ if (isContainingOnlyDigits(confidenceScore)) {
+ taxon = taxon.substr(0, pos); //rip off confidence
+ }
+ }
+ }
+ taxon += ";";
+
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+ newTax += taxon;
+ }
+
+ tax = newTax;
+
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "removeConfidences");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
int readNames(string, map<string, vector<string> >&);
int readNames(string, vector<seqPriorityNode>&, map<string, string>&);
void mothurRemove(string);
+
//searchs and checks
bool checkReleaseVersion(ifstream&, string);
void splitAtDash(string&, set<string>&);
void splitAtDash(string&, vector<string>&);
void splitAtChar(string&, vector<string>&, char);
+ int removeConfidences(string&);
//math operation
int factorial(int num);
--- /dev/null
+/*
+ * myPerseus.cpp
+ *
+ *
+ * Created by Pat Schloss on 9/5/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "myPerseus.h"
+
+/**************************************************************************************************/
+int PERSEUSMAXINT = numeric_limits<int>::max();
+/**************************************************************************************************/
+
+vector<vector<double> > Perseus::binomial(int maxOrder){
+ try {
+ vector<vector<double> > binomial(maxOrder+1);
+
+ for(int i=0;i<=maxOrder;i++){
+ binomial[i].resize(maxOrder+1);
+ binomial[i][0]=1;
+ binomial[0][i]=0;
+ }
+ binomial[0][0]=1;
+
+ binomial[1][0]=1;
+ binomial[1][1]=1;
+
+ for(int i=2;i<=maxOrder;i++){
+ binomial[1][i]=0;
+ }
+
+ for(int i=2;i<=maxOrder;i++){
+ for(int j=1;j<=maxOrder;j++){
+ if(i==j){ binomial[i][j]=1; }
+ if(j>i) { binomial[i][j]=0; }
+ else { binomial[i][j]=binomial[i-1][j-1]+binomial[i-1][j]; }
+ }
+ }
+
+ return binomial;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "binomial");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+double Perseus::basicPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, pwModel model){
+ try {
+ double GAP = model.GAP_OPEN;
+ double MATCH = model.MATCH;
+ double MISMATCH = model.MISMATCH;
+
+ int queryLength = query.size();
+ int refLength = reference.size();
+
+ vector<vector<double> > alignMatrix(queryLength + 1);
+ vector<vector<char> > alignMoves(queryLength + 1);
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i].resize(refLength + 1, 0);
+ alignMoves[i].resize(refLength + 1, 'x');
+ }
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i][0] = GAP * i;
+ alignMoves[i][0] = 'u';
+ }
+
+ for(int i=0;i<=refLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[0][i] = GAP * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=queryLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int j=1;j<=refLength;j++){
+
+ double nogapScore;
+ if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
+ else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
+
+ double leftScore;
+ if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
+ else { leftScore = alignMatrix[i][j-1] + GAP; }
+
+
+ double upScore;
+ if(j == refLength) { upScore = alignMatrix[i-1][j]; }
+ else { upScore = alignMatrix[i-1][j] + GAP; }
+
+ if(nogapScore > leftScore){
+ if(nogapScore > upScore){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogapScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ else{
+ if(leftScore > upScore){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = leftScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ }
+ }
+
+ int i = queryLength;
+ int j = refLength;
+
+ qAlign = "";
+ rAlign = "";
+
+ int diffs = 0;
+ int length = 0;
+
+ while(i > 0 && j > 0){
+
+ if (m->control_pressed) { return 0; }
+
+ if(alignMoves[i][j] == 'd'){
+ qAlign = query[i-1] + qAlign;
+ rAlign = reference[j-1] + rAlign;
+
+ if(query[i-1] != reference[j-1]){ diffs++; }
+ length++;
+
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ qAlign = query[i-1] + qAlign;
+
+ if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
+ else { rAlign = '.' + rAlign; }
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ rAlign = reference[j-1] + rAlign;
+
+ if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
+ else { qAlign = '.' + qAlign; }
+ j--;
+ }
+ }
+
+ while(i>0){
+
+ if (m->control_pressed) { return 0; }
+
+ rAlign = '.' + rAlign;
+ qAlign = query[i-1] + qAlign;
+ i--;
+ }
+
+ while(j>0){
+
+ if (m->control_pressed) { return 0; }
+
+ rAlign = reference[j-1] + rAlign;
+ qAlign = '.' + qAlign;
+ j--;
+ }
+
+
+
+ return double(diffs)/double(length);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "basicPairwiseAlignSeqs");
+ exit(1);
+ }
+
+}
+/**************************************************************************************************/
+int Perseus::getDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
+ try {
+ int alignLength = qAlign.length();
+
+ int lDiffs = 0;
+ int lCount = 0;
+ for(int l=0;l<alignLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){
+ lDiffs++;
+ }
+ leftDiffs[lCount] = lDiffs;
+ leftMap[lCount] = l;
+
+ lCount++;
+ }
+ }
+
+ int rDiffs = 0;
+ int rCount = 0;
+ for(int l=alignLength-1;l>=0;l--){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){
+ rDiffs++;
+ }
+
+ rightDiffs[rCount] = rDiffs;
+ rightMap[rCount] = l;
+ rCount++;
+
+ }
+
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "getDiffs");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int Perseus::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, string& seqA, string& seqB){
+ try {
+ char nullReturn = -1;
+
+ while(i>=1 && j>=1){
+
+ if (m->control_pressed) { return 0; }
+
+ if(direction == 'd'){
+ if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
+ else { return nullReturn; }
+ }
+
+ else if(direction == 'l') { j--; }
+ else { i--; }
+
+ direction = alignMoves[i][j];
+ }
+
+ return nullReturn;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "getLastMatch");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int Perseus::toInt(char b){
+ try {
+ if(b == 'A') { return 0; }
+ else if(b == 'C') { return 1; }
+ else if(b == 'T') { return 2; }
+ else if(b == 'G') { return 3; }
+ else { m->mothurOut("[ERROR]: " + toString(b) + " is not ATGC."); m->mothurOutEndLine(); return -1; }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "toInt");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double Perseus::modeledPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, vector<vector<double> >& correctMatrix){
+ try {
+ int queryLength = query.size();
+ int refLength = reference.size();
+
+ vector<vector<double> > alignMatrix(queryLength + 1);
+ vector<vector<char> > alignMoves(queryLength + 1);
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i].resize(refLength + 1, 0);
+ alignMoves[i].resize(refLength + 1, 'x');
+ }
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i][0] = 15.0 * i;
+ alignMoves[i][0] = 'u';
+ }
+
+ for(int i=0;i<=refLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[0][i] = 15.0 * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=queryLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int j=1;j<=refLength;j++){
+
+ double nogap;
+ nogap = alignMatrix[i-1][j-1] + correctMatrix[toInt(query[i-1])][toInt(reference[j-1])];
+
+ double gap;
+
+ double left;
+ if(i == queryLength){ //terminal gap
+ left = alignMatrix[i][j-1];
+ }
+ else{
+ if(reference[j-1] == getLastMatch('l', alignMoves, i, j, query, reference)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ left = alignMatrix[i][j-1] + gap;
+ }
+
+ double up;
+ if(j == refLength){ //terminal gap
+ up = alignMatrix[i-1][j];
+ }
+ else{
+
+ if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, reference)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ up = alignMatrix[i-1][j] + gap;
+ }
+
+
+ if(nogap < left){
+ if(nogap < up){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogap;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ else{
+ if(left < up){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = left;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ }
+ }
+
+ int i = queryLength;
+ int j = refLength;
+
+ int alignLength = 0;
+
+ while(i > 0 && j > 0){
+
+ if (m->control_pressed) { return 0; }
+
+ if(alignMoves[i][j] == 'd'){
+ qAlign = query[i-1] + qAlign;
+ rAlign = reference[j-1] + rAlign;
+ alignLength++;
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ if(j != refLength){
+ qAlign = query[i-1] + qAlign;
+ rAlign = '-' + rAlign;
+ alignLength++;
+ }
+
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ if(i != queryLength){
+ qAlign = '-' + qAlign;
+ rAlign = reference[j-1] + rAlign;
+ alignLength++;
+ }
+
+ j--;
+ }
+ }
+
+ return alignMatrix[queryLength][refLength] / (double)alignLength;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "modeledPairwiseAlignSeqs");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int Perseus::getAlignments(int curSequenceIndex, vector<seqData> sequences, vector<pwAlign>& alignments, vector<vector<int> >& leftDiffs, vector<vector<int> >& leftMaps, vector<vector<int> >& rightDiffs, vector<vector<int> >& rightMaps, int& bestRefSeq, int& bestRefDiff, vector<bool>& restricted){
+ try {
+ int numSeqs = sequences.size();
+ //int bestSequenceMismatch = PERSEUSMAXINT;
+
+ string curSequence = sequences[curSequenceIndex].sequence;
+ int curFrequency = sequences[curSequenceIndex].frequency;
+
+ bestRefSeq = -1;
+
+ int bestIndex = -1;
+ int bestDiffs = PERSEUSMAXINT;
+ int comparisons = 0;
+
+ pwModel model(0, -1, -1.5);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(i != curSequenceIndex && restricted[i] != 1 && sequences[i].frequency >= 2 * curFrequency){
+ string refSequence = sequences[i].sequence;
+
+ leftDiffs[i].assign(curSequence.length(), 0);
+ leftMaps[i].assign(curSequence.length(), 0);
+ rightDiffs[i].assign(curSequence.length(), 0);
+ rightMaps[i].assign(curSequence.length(), 0);
+
+ basicPairwiseAlignSeqs(curSequence, refSequence, alignments[i].query, alignments[i].reference, model);
+
+
+ getDiffs(alignments[i].query, alignments[i].reference, leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
+
+ int diffs = rightDiffs[i][curSequence.length()-1];
+
+ if(diffs < bestDiffs){
+ bestDiffs = diffs;
+ bestIndex = i;
+ }
+ comparisons++;
+ restricted[i] = 0;
+ }
+ else{
+ restricted[i] = 1;
+ }
+ }
+
+ bestRefSeq = bestIndex;
+ bestRefDiff = bestDiffs;
+
+ return comparisons;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "getAlignments");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int Perseus::getChimera(vector<seqData> sequences,
+ vector<vector<int> >& leftDiffs,
+ vector<vector<int> >& rightDiffs,
+ int& leftParent,
+ int& rightParent,
+ int& breakPoint,
+ vector<int>& singleLeft,
+ vector<int>& bestLeft,
+ vector<int>& singleRight,
+ vector<int>& bestRight,
+ vector<bool> restricted){
+ try {
+ int numRefSeqs = restricted.size();
+ int seqLength = leftDiffs[0].size();
+
+ singleLeft.resize(seqLength, PERSEUSMAXINT);
+ bestLeft.resize(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int i=0;i<numRefSeqs;i++){
+
+ if(restricted[i] == 0){
+ if(leftDiffs[i][l] < singleLeft[l] && sequences[i].frequency || (leftDiffs[i][l] == singleLeft[l] && sequences[i].frequency > sequences[bestLeft[l]].frequency)){
+ singleLeft[l] = leftDiffs[i][l];
+ bestLeft[l] = i;
+ }
+ }
+ }
+ }
+
+ singleRight.resize(seqLength, PERSEUSMAXINT);
+ bestRight.resize(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int i=0;i<numRefSeqs;i++){
+
+ if(restricted[i] == 0){
+ if(rightDiffs[i][l] < singleRight[l] && sequences[i].frequency || (rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency)){
+ singleRight[l] = rightDiffs[i][l];
+ bestRight[l] = i;
+ }
+ }
+ }
+ }
+
+
+
+ int bestChimeraMismatches = PERSEUSMAXINT;
+ leftParent = -1;
+ rightParent = -1;
+ breakPoint = -1;
+
+ for(int l=0;l<seqLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
+ if(chimera < bestChimeraMismatches){
+ bestChimeraMismatches = chimera;
+ breakPoint = l;
+ leftParent = bestLeft[l];
+ rightParent = bestRight[seqLength - l - 2];
+ }
+ }
+
+ return bestChimeraMismatches;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "getChimera");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string Perseus::stitchBimera(vector<pwAlign>& alignments, int leftParent, int rightParent, int breakPoint, vector<vector<int> >& leftMaps, vector<vector<int> >& rightMaps){
+ try {
+ int breakLeft = leftMaps[leftParent][breakPoint];
+ int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
+
+ string left = alignments[leftParent].reference;
+ string right = alignments[rightParent].reference;
+ string chimera = "";
+
+ for(int i=0;i<=breakLeft;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(left[i] != '-' && left[i] != '.'){
+ chimera += left[i];
+ }
+ }
+
+
+ for(int i=breakRight;i<right.length();i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(right[i] != '-' && right[i] != '.'){
+ chimera += right[i];
+ }
+ }
+
+ return chimera;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "stitchBimera");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int Perseus::getTrimera(vector<seqData>& sequences,
+ vector<vector<int> >& leftDiffs,
+ int& leftParent,
+ int& middleParent,
+ int& rightParent,
+ int& breakPointA,
+ int& breakPointB,
+ vector<int>& singleLeft,
+ vector<int>& bestLeft,
+ vector<int>& singleRight,
+ vector<int>& bestRight,
+ vector<bool> restricted){
+ try {
+ int numRefSeqs = leftDiffs.size();
+ int alignLength = leftDiffs[0].size();
+ int bestTrimeraMismatches = PERSEUSMAXINT;
+
+ leftParent = -1;
+ middleParent = -1;
+ rightParent = -1;
+
+ breakPointA = -1;
+ breakPointB = -1;
+
+ vector<vector<int> > minDelta(alignLength);
+ vector<vector<int> > minDeltaSeq(alignLength);
+
+ for(int i=0;i<alignLength;i++){
+ if (m->control_pressed) { return 0; }
+ minDelta[i].assign(alignLength, PERSEUSMAXINT);
+ minDeltaSeq[i].assign(alignLength, -1);
+ }
+
+ for(int x=0;x<alignLength;x++){
+ for(int y=x;y<alignLength-1;y++){
+ for(int i=0;i<numRefSeqs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(restricted[i] == 0){
+ int delta = leftDiffs[i][y] - leftDiffs[i][x];
+
+ if(delta < minDelta[x][y] || delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency){
+ minDelta[x][y] = delta;
+ minDeltaSeq[x][y] = i;
+ }
+ }
+ }
+ minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
+
+ if(minDelta[x][y] < bestTrimeraMismatches){
+ bestTrimeraMismatches = minDelta[x][y];
+
+ breakPointA = x;
+ breakPointB = y;
+
+ leftParent = bestLeft[x];
+ middleParent = minDeltaSeq[x][y];
+ rightParent = bestRight[alignLength - y - 2];
+ }
+ }
+ }
+
+ return bestTrimeraMismatches;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "getTrimera");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string Perseus::stitchTrimera(vector<pwAlign> alignments, int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB, vector<vector<int> >& leftMaps, vector<vector<int> >& rightMaps){
+ try {
+ int p1SplitPoint = leftMaps[leftParent][breakPointA];
+ int p2SplitPoint = leftMaps[middleParent][breakPointB];
+ int p3SplitPoint = rightMaps[rightParent][rightMaps[rightParent].size() - breakPointB - 2];
+
+ string chimeraRefSeq;
+ for(int i=0;i<=p1SplitPoint;i++){
+ if (m->control_pressed) { return chimeraRefSeq; }
+ if(alignments[leftParent].reference[i] != '-' && alignments[leftParent].reference[i] != '.'){
+ chimeraRefSeq += alignments[leftParent].reference[i];
+ }
+ }
+
+ for(int i=p1SplitPoint+1;i<=p2SplitPoint;i++){
+ if (m->control_pressed) { return chimeraRefSeq; }
+ if(alignments[middleParent].reference[i] != '-' && alignments[middleParent].reference[i] != '.'){
+ chimeraRefSeq += alignments[middleParent].reference[i];
+ }
+ }
+
+ for(int i=p3SplitPoint;i<alignments[rightParent].reference.length();i++){
+ if (m->control_pressed) { return chimeraRefSeq; }
+ if(alignments[rightParent].reference[i] != '-' && alignments[rightParent].reference[i] != '.'){
+ chimeraRefSeq += alignments[rightParent].reference[i];
+ }
+ }
+
+ return chimeraRefSeq;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "stitchTrimera");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int Perseus::threeWayAlign(string query, string parent1, string parent2, string& qAlign, string& aAlign, string& bAlign){
+ try {
+ pwModel model(1.0, -1.0, -5.0);
+
+ string qL, rL;
+ string qR, rR;
+
+ basicPairwiseAlignSeqs(query, parent1, qL, rL, model);
+ basicPairwiseAlignSeqs(query, parent2, qR, rR, model);
+
+ int lLength = qL.length();
+ int rLength = qR.length();
+
+ string qLNew, rLNew;
+ string qRNew, rRNew;
+
+ int lIndex = 0;
+ int rIndex = 0;
+
+ while(lIndex<lLength || rIndex<rLength){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qL[lIndex] == qR[rIndex]){
+ qLNew += qL[lIndex];
+ rLNew += rL[lIndex];
+ lIndex++;
+
+ qRNew += qR[rIndex];
+ rRNew += rR[rIndex];
+ rIndex++;
+ }
+ else if(qL[lIndex] == '-' || qL[lIndex] == '.'){
+ //insert a gap into the right sequences
+ qLNew += qL[lIndex];
+ rLNew += rL[lIndex];
+ lIndex++;
+
+ if(rIndex != rLength){
+ qRNew += '-';
+ rRNew += '-';
+ }
+ else{
+ qRNew += '.';
+ rRNew += '.';
+ }
+ }
+ else if(qR[rIndex] == '-' || qR[rIndex] == '.'){
+ //insert a gap into the left sequences
+ qRNew += qR[rIndex];
+ rRNew += rR[rIndex];
+ rIndex++;
+
+
+ if(lIndex != lLength){
+ qLNew += '-';
+ rLNew += '-';
+ }
+ else{
+ qLNew += '.';
+ rLNew += '.';
+ }
+
+ }
+ }
+
+ qAlign = qLNew;
+ aAlign = rLNew;
+ bAlign = rRNew;
+
+ bool qStart = 0;
+ bool aStart = 0;
+ bool bStart = 0;
+
+ for(int i=0;i<qAlign.length();i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qStart == 0){
+ if(qAlign[i] == '-') { qAlign[i] = '.'; }
+ else { qStart = 1; }
+ }
+ if(aStart == 0){
+ if(aAlign[i] == '-') { aAlign[i] = '.'; }
+ else { aStart = 1; }
+ }
+ if(bStart == 0){
+ if(bAlign[i] == '-') { bAlign[i] = '.'; }
+ else { bStart = 1; }
+ }
+ if(aStart == 1 && bStart == 1 && qStart == 1){
+ break;
+ }
+ }
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "threeWayAlign");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double Perseus::calcLoonIndex(string query, string parent1, string parent2, int breakPoint, vector<vector<double> >& binMatrix){
+ try {
+ string queryAln, leftParentAln, rightParentAln;
+ threeWayAlign(query, parent1, parent2, queryAln, leftParentAln, rightParentAln);
+
+ int alignLength = queryAln.length();
+
+ int endPos = alignLength;
+ for(int i=alignLength-1;i>=0; i--){
+ if(queryAln[i] != '.' && leftParentAln[i] != '.' && rightParentAln[i] != '.'){
+ endPos = i + 1;
+ break;
+ }
+ }
+
+ int diffToLeftCount = 0;
+ vector<int> diffToLeftMap(alignLength, 0);
+
+ int diffToRightCount = 0;
+ vector<int> diffToRightMap(alignLength, 0);
+
+ for(int i=0;i<endPos;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(queryAln[i] != leftParentAln[i]){
+ diffToLeftMap[diffToLeftCount] = i;
+ diffToLeftCount++;
+ }
+
+ if(queryAln[i] != rightParentAln[i]){
+ diffToRightMap[diffToRightCount] = i;
+ diffToRightCount++;
+ }
+ }
+
+
+
+ diffToLeftMap[diffToLeftCount] = endPos;
+ diffToRightMap[diffToRightCount] = endPos;
+
+ int indexL = 0;
+ int indexR = 0;
+ int indexS = 0;
+
+ vector<int> diffs;
+ vector<int> splits;
+
+ splits.push_back(-1);
+ diffs.push_back(diffToRightCount);
+ indexS++;
+
+ while(indexL < diffToLeftCount || indexR < diffToRightCount){
+
+ if (m->control_pressed) { return 0; }
+
+ if(diffToLeftMap[indexL] <= diffToRightMap[indexR]){
+ diffs.push_back(diffs[indexS - 1] + 1);
+ splits.push_back(diffToLeftMap[indexL]);
+
+ indexL++;
+ indexS++;
+ }
+ else if(diffToLeftMap[indexL] > diffToRightMap[indexR]) {
+ diffs.push_back(diffs[indexS - 1] - 1);
+ splits.push_back(diffToRightMap[indexR]);
+
+ indexR++;
+ indexS++;
+ }
+ }
+
+ int minDiff = PERSEUSMAXINT;
+ int minIndex = -1;
+ for(int i=0;i<indexS;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(diffs[i] < minDiff){
+ minDiff = diffs[i];
+ minIndex = i;
+ }
+ }
+
+ int splitPos = endPos;
+ if(minIndex < indexS - 1){
+ splitPos = (splits[minIndex]+splits[minIndex+1]) / 2;
+ }
+
+ int diffToChimera = 0;
+ int leftDiffToP1 = 0;
+ int rightDiffToP1 = 0;
+ int leftDiffToP2 = 0;
+ int rightDiffToP2 = 0;
+
+ for(int i=0;i<endPos;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ char bQuery = queryAln[i];
+ char bP1 = leftParentAln[i];
+ char bP2 = rightParentAln[i];
+
+ char bConsensus = bQuery;
+ if(bP1 == bP2){ bConsensus = bP1; }
+
+ if(bConsensus != bQuery){
+ diffToChimera++;
+ }
+
+ if(bConsensus != bP1){
+ if(i <= splitPos){
+ leftDiffToP1++;
+ }
+ else{
+ rightDiffToP1++;
+ }
+ }
+ if(bConsensus != bP2){
+ if(i <= splitPos){
+ leftDiffToP2++;
+ }
+ else{
+ rightDiffToP2++;
+ }
+ }
+ }
+
+
+ int diffToClosestParent, diffToFurtherParent;
+ int xA, xB, yA, yB;
+ double aFraction, bFraction;
+
+ if(diffToLeftCount <= diffToRightCount){ //if parent 1 is closer
+
+ diffToClosestParent = leftDiffToP1 + rightDiffToP1;
+ xA = leftDiffToP1;
+ xB = rightDiffToP1;
+
+ diffToFurtherParent = leftDiffToP2 + rightDiffToP2;
+ yA = leftDiffToP2;
+ yB = rightDiffToP2;
+
+ aFraction = double(splitPos + 1)/(double) endPos;
+ bFraction = 1 - aFraction;
+
+ }
+ else{ //if parent 2 is closer
+
+ diffToClosestParent = leftDiffToP2 + rightDiffToP2;
+ xA = rightDiffToP2;
+ xB = leftDiffToP2;
+
+ diffToFurtherParent = leftDiffToP1 + rightDiffToP1;
+ yA = rightDiffToP1;
+ yB = leftDiffToP1;
+
+ bFraction = double(splitPos + 1)/(double) endPos;
+ aFraction = 1 - bFraction;
+
+ }
+
+ double loonIndex = 0;
+
+ int totalDifference = diffToClosestParent + diffToChimera;
+
+ if(totalDifference > 0){
+ double prob = 0;
+
+ for(int i=diffToClosestParent;i<=totalDifference;i++){
+ prob += binMatrix[totalDifference][i] * pow(0.50, i) * pow(0.50, totalDifference - i);
+ }
+ loonIndex += -log(prob);
+ }
+
+ if(diffToFurtherParent > 0){
+ double prob = 0;
+
+ for(int i=yA;i<=diffToFurtherParent;i++){
+ prob += binMatrix[diffToFurtherParent][i] * pow(aFraction, i) * pow(1-aFraction, diffToFurtherParent - i);
+ }
+ loonIndex += -log(prob);
+ }
+
+ if(diffToClosestParent > 0){
+ double prob = 0;
+
+ for(int i=xB;i<=diffToClosestParent;i++){
+ prob += binMatrix[diffToClosestParent][i] * pow(bFraction, i) * pow(1-bFraction, diffToClosestParent - i);
+ }
+ loonIndex += -log(prob);
+ }
+
+ return loonIndex;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "calcLoonIndex");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double Perseus::calcBestDistance(string query, string reference){
+ try {
+ int alignLength = query.length();
+ int mismatch = 0;
+ int counter = 0;
+
+ for(int i=0;i<alignLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if((query[i] != '.' || reference[i] != '.') && (query[i] != '-' && reference[i] != '-')){
+ if(query[i] != reference[i]){ mismatch++; }
+ counter++;
+ }
+ }
+
+ return (double)mismatch / (double)counter;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "calcBestDistance");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double Perseus::classifyChimera(double singleDist, double cIndex, double loonIndex, double alpha, double beta){
+ try {
+ double difference = cIndex - singleDist; //y
+ double probability;
+
+ if(cIndex >= 0.15 || difference > 0.00){
+ probability = 0.0000;
+ }
+ else{
+ probability = 1.0 / (1.0 + exp(-(alpha + beta * loonIndex)));
+ }
+
+ return probability;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Perseus", "classifyChimera");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
--- /dev/null
+#ifndef MOTHURPERSEUS
+#define MOTHURPERSEUS
+
+/*
+ * myPerseus.h
+ *
+ *
+ * Created by Pat Schloss on 9/5/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+#include "mothurout.h"
+
+/**************************************************************************************************/
+struct seqData {
+
+ seqData(string name, string seq, int freq) : seqName(name), sequence(seq), frequency(freq) { }
+
+ bool operator<( seqData const& rhs ) const {
+
+ bool verdict = 0;
+
+ if(frequency < rhs.frequency){
+ verdict = 1;
+ }
+ else if(frequency == rhs.frequency){
+ verdict = (seqName > rhs.seqName);
+ }
+
+ return verdict;
+ }
+
+ string seqName;
+ string sequence;
+ int frequency;
+};
+/**************************************************************************************************/
+struct pwModel {
+ pwModel(double m, double mm, double g): MATCH(m), MISMATCH(mm), GAP_OPEN(g) {;}
+ double MATCH;
+ double MISMATCH;
+ double GAP_OPEN;
+};
+/**************************************************************************************************/
+struct pwAlign {
+ pwAlign(): query(""), reference(""){}
+ pwAlign(string q, string r): query(q), reference(r){}
+ string query;
+ string reference;
+
+};
+/**************************************************************************************************/
+class Perseus {
+
+public:
+ Perseus() { m = MothurOut::getInstance(); }
+ ~Perseus() {}
+
+ vector<vector<double> > binomial(int);
+ double modeledPairwiseAlignSeqs(string, string, string&, string&, vector<vector<double> >&);
+ int getAlignments(int, vector<seqData>, vector<pwAlign>&, vector<vector<int> >& , vector<vector<int> >&, vector<vector<int> >&, vector<vector<int> >&, int&, int&, vector<bool>&);
+ int getChimera(vector<seqData>,vector<vector<int> >&, vector<vector<int> >&,int&, int&, int&,vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<bool>);
+ string stitchBimera(vector<pwAlign>&, int, int, int, vector<vector<int> >&, vector<vector<int> >&);
+ int getTrimera(vector<seqData>&, vector<vector<int> >&, int&, int&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<bool>);
+ string stitchTrimera(vector<pwAlign>, int, int, int, int, int, vector<vector<int> >&, vector<vector<int> >&);
+ double calcLoonIndex(string, string, string, int, vector<vector<double> >&);
+ double classifyChimera(double, double, double, double, double);
+
+private:
+ MothurOut* m;
+ int toInt(char);
+ double basicPairwiseAlignSeqs(string, string, string&, string&, pwModel);
+ int getDiffs(string, string, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
+ int getLastMatch(char, vector<vector<char> >&, int, int, string&, string&);
+ int threeWayAlign(string, string, string, string&, string&, string&);
+ double calcBestDistance(string, string);
+
+
+};
+/**************************************************************************************************/
+#endif
+
+
int level = 0;
//are there confidence scores, if so remove them
- if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); }
+ if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
while (seqTaxonomy != "") {
int level = 0;
//are there confidence scores, if so remove them
- if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); }
+ if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
while (seqTaxonomy != "") {
}
}
/**************************************************************************************************/
-void PhyloSummary::removeConfidences(string& tax) {
- try {
-
- string taxon;
- string newTax = "";
-
- while (tax.find_first_of(';') != -1) {
- //get taxon
- taxon = tax.substr(0,tax.find_first_of(';'));
-
- int pos = taxon.find_first_of('(');
- if (pos != -1) {
- taxon = taxon.substr(0, pos); //rip off confidence
- }
-
- taxon += ";";
-
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());
- newTax += taxon;
- }
-
- tax = newTax;
- }
- catch(exception& e) {
- m->errorOut(e, "PhyloSummary", "removeConfidences");
- exit(1);
- }
-}
-/**************************************************************************************************/
int numSeqs;
int maxLevel;
MothurOut* m;
-
- void removeConfidences(string&);
};
/**************************************************************************************************/
if (hasConPos != string::npos) {
taxonsHasConfidence[i] = true;
searchTaxons[i] = getTaxons(listOfTaxons[i]);
- noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]);
+ noConfidenceTaxons[i] = listOfTaxons[i];
+ m->removeConfidences(noConfidenceTaxons[i]);
}
}
int hasConfidences = tax.find_first_of('(');
if (hasConfidences != string::npos) {
- newtax = removeConfidences(tax);
+ newtax = tax;
+ m->removeConfidences(newtax);
}
int pos = newtax.find(noConfidenceTaxons[j]);
string noNewTax = tax;
int hasConfidences = tax.find_first_of('(');
if (hasConfidences != string::npos) {
- noNewTax = removeConfidences(tax);
+ noNewTax = tax;
+ m->removeConfidences(noNewTax);
}
int pos = noNewTax.find(noConfidenceTaxons[j]);
exit(1);
}
}
-/**************************************************************************************************/
-string RemoveLineageCommand::removeConfidences(string tax) {
- try {
-
- string taxon = "";
- int taxLength = tax.length();
- for(int i=0;i<taxLength;i++){
- if(tax[i] == ';'){
- taxon = taxon.substr(0, taxon.find_first_of('(')); //rip off confidence
- taxon += ";";
- }
- else{
- taxon += tax[i];
- }
- }
-
- return taxon;
- }
- catch(exception& e) {
- m->errorOut(e, "RemoveLineageCommand", "removeConfidences");
- exit(1);
- }
-}
//**********************************************************************************************************************
//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
int RemoveLineageCommand::readAlign(){
int readAlign();
int readList();
int readTax();
- string removeConfidences(string);
vector< map<string, float> > getTaxons(string);
};
in >> name >> tax; m->gobble(in);
- if (containsConfidence) { removeConfidences(tax); }
+ if (containsConfidence) { m->removeConfidences(tax); }
//is this a taxonomy that needs to be extended?
if (seqLevels[name] < highestLevel) {
}
}
/**************************************************************************************************/
-void TaxEqualizer::removeConfidences(string& tax) {
- try {
-
- string taxon;
- string newTax = "";
-
- while (tax.find_first_of(';') != -1) {
- //get taxon
- taxon = tax.substr(0,tax.find_first_of(';'));
- taxon = taxon.substr(0, taxon.find_first_of('(')); //rip off confidence
- taxon += ";";
-
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());
- newTax += taxon;
- }
-
- tax = newTax;
- }
- catch(exception& e) {
- m->errorOut(e, "TaxEqualizer", "removeConfidences");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
int getHighestLevel(ifstream&); //scans taxonomy file to find taxonomy with highest level
void extendTaxonomy(string, string&, int); //name, taxonomy, desired level
void truncateTaxonomy(string, string&, int); //name, taxonomy, desired level
- void removeConfidences(string&); //removes the confidence limits on the taxon
MothurOut* m;
};