From 74dc92cf53df65fd8b14d8eaf35489bbecbccac6 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 7 Nov 2011 16:22:50 +0000 Subject: [PATCH] working on chimera.perseus. made removeConfidences function smarter. Fixed bug in mothurmetastats.cpp that stemmed from the original source. Indexes were off, became noticeable because we sort group names. --- Mothur.xcodeproj/project.pbxproj | 12 + chimeraperseuscommand.cpp | 1024 ++++++++++++++++++++++++++++ chimeraperseuscommand.h | 325 +++++++++ chimerauchimecommand.cpp | 4 +- classifyotucommand.cpp | 34 +- classifyotucommand.h | 1 - commandfactory.cpp | 6 + getlineagecommand.cpp | 32 +- getlineagecommand.h | 1 - metastatscommand.cpp | 5 +- mothurmetastats.cpp | 36 +- mothurout.cpp | 40 ++ mothurout.h | 2 + myPerseus.cpp | 1073 ++++++++++++++++++++++++++++++ myPerseus.h | 85 +++ phylosummary.cpp | 33 +- phylosummary.h | 2 - removelineagecommand.cpp | 32 +- removelineagecommand.h | 1 - taxonomyequalizer.cpp | 27 +- taxonomyequalizer.h | 1 - 21 files changed, 2607 insertions(+), 169 deletions(-) create mode 100644 chimeraperseuscommand.cpp create mode 100644 chimeraperseuscommand.h create mode 100644 myPerseus.cpp create mode 100644 myPerseus.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 2da1ed5..a756931 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -49,6 +49,8 @@ A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; }; A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; }; A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; }; + A7BF221414587886000AD524 /* myPerseus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF221214587886000AD524 /* myPerseus.cpp */; }; + A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; }; A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; }; A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; }; A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; }; @@ -432,6 +434,10 @@ A7A61F2B130062E000E05B6B /* amovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = amovacommand.h; sourceTree = ""; }; A7A61F2C130062E000E05B6B /* amovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = amovacommand.cpp; sourceTree = ""; }; A7AACFBA132FE008003D6C4D /* currentfile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = currentfile.h; sourceTree = ""; }; + A7BF221214587886000AD524 /* myPerseus.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myPerseus.cpp; sourceTree = ""; }; + A7BF221314587886000AD524 /* myPerseus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myPerseus.h; sourceTree = ""; }; + A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraperseuscommand.h; sourceTree = ""; }; + A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraperseuscommand.cpp; sourceTree = ""; }; A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = ""; }; A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = ""; }; A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = ""; }; @@ -1214,6 +1220,8 @@ A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */, A7E9B68312D37EC400DA6239 /* chimerapintailcommand.h */, A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */, + A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */, + A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */, A7E9B68B12D37EC400DA6239 /* chimeraslayercommand.h */, A7E9B68A12D37EC400DA6239 /* chimeraslayercommand.cpp */, A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */, @@ -1688,6 +1696,8 @@ A7E9B68912D37EC400DA6239 /* chimeraslayer.h */, A7E9B74612D37EC400DA6239 /* maligner.h */, A7E9B74512D37EC400DA6239 /* maligner.cpp */, + A7BF221314587886000AD524 /* myPerseus.h */, + A7BF221214587886000AD524 /* myPerseus.cpp */, A7E9B79312D37EC400DA6239 /* pintail.cpp */, A7E9B79412D37EC400DA6239 /* pintail.h */, A7E9B82E12D37EC400DA6239 /* slayer.cpp */, @@ -2155,6 +2165,8 @@ A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */, A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */, A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */, + A7BF221414587886000AD524 /* myPerseus.cpp in Sources */, + A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp new file mode 100644 index 0000000..240ba65 --- /dev/null +++ b/chimeraperseuscommand.cpp @@ -0,0 +1,1024 @@ +/* + * chimeraperseuscommand.cpp + * Mothur + * + * Created by westcott on 10/26/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "chimeraperseuscommand.h" +#include "deconvolutecommand.h" +#include "sequence.hpp" +//********************************************************************************************************************** +vector ChimeraPerseusCommand::setParameters(){ + try { + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pcutoff); + CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "",false,false); parameters.push_back(palpha); + CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "",false,false); parameters.push_back(pbeta); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ChimeraPerseusCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n"; + helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n"; + helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file. If none is given unique.seqs will be run to generate one. \n"; + helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; + helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; + helpString += "The alpha parameter .... The default is -5.54. \n"; + helpString += "The beta parameter .... The default is 0.33. \n"; + helpString += "The cutoff parameter .... The default is 0.50. \n"; + helpString += "The chimera.perseus command should be in the following format: \n"; + helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n"; + helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n"; + helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +ChimeraPerseusCommand::ChimeraPerseusCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["chimera"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand"); + exit(1); + } +} +//*************************************************************************************************************** +ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter("chimera.uchime"); + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + vector tempOutNames; + outputTypes["chimera"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", false); + if (fastafile == "not found") { + //if there is a current fasta file, use it + string filename = m->getFastaFile(); + if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { + m->splitAtDash(fastafile, fastaFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < fastaFileNames.size(); i++) { + + bool ignore = false; + if (fastaFileNames[i] == "current") { + fastaFileNames[i] = m->getFastaFile(); + if (fastaFileNames[i] != "") { m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(fastaFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fastaFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fastaFileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; + }else { + m->setFastaFile(fastaFileNames[i]); + } + } + } + + //make sure there is at least one valid file left + if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; } + } + + + //check for required parameters + bool hasName = true; + namefile = validParameter.validFile(parameters, "name", false); + if (namefile == "not found") { + //if there is a current fasta file, use it + string filename = m->getNameFile(); + if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; } + hasName = false; + }else { + m->splitAtDash(namefile, nameFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < nameFileNames.size(); i++) { + + bool ignore = false; + if (nameFileNames[i] == "current") { + nameFileNames[i] = m->getNameFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + nameFileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + nameFileNames.erase(nameFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(nameFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]); + m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + nameFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]); + m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + nameFileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + nameFileNames.erase(nameFileNames.begin()+i); + i--; + }else { + m->setNameFile(nameFileNames[i]); + } + } + } + + //make sure there is at least one valid file left + if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + } + + if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + bool hasGroup = true; + groupfile = validParameter.validFile(parameters, "group", false); + if (groupfile == "not found") { groupfile = ""; hasGroup = false; } + else { + m->splitAtDash(groupfile, groupFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < groupFileNames.size(); i++) { + + bool ignore = false; + if (groupFileNames[i] == "current") { + groupFileNames[i] = m->getGroupFile(); + if (groupFileNames[i] != "") { m->mothurOut("Using " + groupFileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + groupFileNames.erase(groupFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(groupFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { groupFileNames[i] = inputDir + groupFileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(groupFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(groupFileNames[i]); + m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(groupFileNames[i]); + m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupFileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + groupFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + groupFileNames.erase(groupFileNames.begin()+i); + i--; + }else { + m->setGroupFile(groupFileNames[i]); + } + } + } + + //make sure there is at least one valid file left + if (groupFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid group files."); m->mothurOutEndLine(); abort = true; } + } + + if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + convert(temp, processors); + + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.50"; } + convert(temp, cutoff); + + temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found"){ temp = "-5.54"; } + convert(temp, alpha); + + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.33"; } + convert(temp, beta); + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand"); + exit(1); + } +} +//*************************************************************************************************************** + +int ChimeraPerseusCommand::execute(){ + try{ + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + + //process each file + for (int s = 0; s < fastaFileNames.size(); s++) { + + m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); + + int start = time(NULL); + if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it + string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.chimera"; + string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.accnos"; + //string newFasta = m->getRootName(fastaFileNames[s]) + "temp"; + + //you provided a groupfile + string groupFile = ""; + if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; } + + string nameFile = ""; + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + nameFile = nameFileNames[s]; + }else { nameFile = getNamesFile(fastaFileNames[s]); } + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + int numSeqs = 0; + int numChimeras = 0; + + if (groupFile != "") { + //Parse sequences by group + SequenceParser parser(groupFile, fastaFileNames[s], nameFile); + vector groups = parser.getNamesOfGroups(); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + + if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName); + + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + }else{ + if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //read sequences and store sorted by frequency + vector sequences = readFiles(fastaFileNames[s], nameFile); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + } + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine(); + outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName); + outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName); + } + + //set accnos file as new current accnosfile + string current = ""; + itTypes = outputTypes.find("accnos"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +string ChimeraPerseusCommand::getNamesFile(string& inputFile){ + try { + string nameFile = ""; + + m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + inputFile; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + nameFile = filenames["name"][0]; + inputFile = filenames["fasta"][0]; + + return nameFile; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile"); + exit(1); + } +} +//********************************************************************************************************************** +int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector groups){ + try { + + int totalSeqs = 0; + int numChimeras = 0; + + for (int i = start; i < end; i++) { + + m->mothurOutEndLine(); m->mothurOut("Checking sequences from group " + groups[i] + "..."); m->mothurOutEndLine(); + + int start = time(NULL); if (m->control_pressed) { return 0; } + + vector sequences = loadSequences(parser, groups[i]); + + if (m->control_pressed) { return 0; } + + int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); + totalSeqs += numSeqs; + + if (m->control_pressed) { return 0; } + + //append files + m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i])); + m->appendFiles((accnos+groups[i]), accnos); m->mothurRemove((accnos+groups[i])); + + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine(); + } + + return totalSeqs; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "driverGroups"); + exit(1); + } +} +//********************************************************************************************************************** +vector ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){ + try { + + vector thisGroupsSeqs = parser.getSeqs(group); + map nameMap = parser.getNameMap(group); + map::iterator it; + + vector sequences; + bool error = false; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (m->control_pressed) { return sequences; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } + else { + int num = m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + } + } + + if (error) { m->control_pressed = true; } + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + + return sequences; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "loadSequences"); + exit(1); + } +} + +//********************************************************************************************************************** +vector ChimeraPerseusCommand::readFiles(string inputFile, string name){ + try { + map::iterator it; + map nameMap = m->readNames(name); + + //read fasta file and create sequenceData structure - checking for file mismatches + vector sequences; + bool error = false; + ifstream in; + m->openInputFile(inputFile, in); + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); return sequences; } + + Sequence temp(in); m->gobble(in); + + it = nameMap.find(temp.getName()); + if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } + else { + sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second)); + } + } + in.close(); + + if (error) { m->control_pressed = true; } + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + + return sequences; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile"); + exit(1); + } +} +//********************************************************************************************************************** +int ChimeraPerseusCommand::driver(string chimeraFileName, vector& sequences, string accnosFileName, int& numChimeras){ + try { + + vector > correctModel(4); //could be an option in the future to input own model matrix + for(int i=0;i<4;i++){ correctModel[i].resize(4); } + + correctModel[0][0] = 0.000000; //AA + correctModel[1][0] = 11.619259; //CA + correctModel[2][0] = 11.694004; //TA + correctModel[3][0] = 7.748623; //GA + + correctModel[1][1] = 0.000000; //CC + correctModel[2][1] = 7.619657; //TC + correctModel[3][1] = 12.852562; //GC + + correctModel[2][2] = 0.000000; //TT + correctModel[3][2] = 10.964048; //TG + + correctModel[3][3] = 0.000000; //GG + + for(int i=0;i<4;i++){ + for(int j=0;jopenOutputFile(chimeraFileName, chimeraFile); + m->openOutputFile(accnosFileName, accnosFile); + + Perseus myPerseus; + vector > binMatrix = myPerseus.binomial(alignLength); + + chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n"; + + vector chimeras(numSeqs, 0); + + for(int i=0;icontrol_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + vector restricted = chimeras; + + vector > leftDiffs(numSeqs); + vector > leftMaps(numSeqs); + vector > rightDiffs(numSeqs); + vector > rightMaps(numSeqs); + + vector singleLeft, bestLeft; + vector singleRight, bestRight; + + int bestSingleIndex, bestSingleDiff; + vector alignments(numSeqs); + + int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); + cout << "comparisons = " << comparisons << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; + + string dummyA, dummyB; + + if(comparisons >= 2){ + minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); + cout << "minMismatchToChimera = " << minMismatchToChimera << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToTrimera = numeric_limits::max(); + int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; + + if(minMismatchToChimera >= 3 && comparisons >= 3){ + minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); + cout << "minMismatchToTrimera = " << minMismatchToTrimera << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + } + + double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); + cout << "singleDist = " << singleDist << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + string type; + string chimeraRefSeq; + + if(minMismatchToChimera - minMismatchToTrimera >= 3){ + type = "trimera"; + chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps); + } + else{ + type = "chimera"; + chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); + } + cout << "chimeraRefSeq = " << chimeraRefSeq << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); + cout << "chimeraDist = " << chimeraDist << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq); + double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); + cout << "loonIndex = " << loonIndex << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } + + chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; + chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; + chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t'; + chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t'; + + double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, alpha, beta); + + chimeraFile << probability << '\t'; + + if(probability > cutoff){ + chimeraFile << type << endl; + accnosFile << sequences[i].seqName << endl; + chimeras[i] = 1; + numChimeras++; + } + else{ + chimeraFile << "good" << endl; + } + + } + else{ + chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; + } + + //report progress + if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1) + "\n"); } + } + + if((numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); } + + chimeraFile.close(); + accnosFile.close(); + + return numSeqs; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "driver"); + exit(1); + } +} +/**************************************************************************************************/ +int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector groups, string group, string fasta, string name) { + try { + + vector processIDS; + int process = 1; + int num = 0; + + //sanity check + if (groups.size() < processors) { processors = groups.size(); } + + //divide the groups between the processors + vector lines; + int numGroupsPerProcessor = groups.size() / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numGroupsPerProcessor; + int endIndex = (i+1) * numGroupsPerProcessor; + if(i == (processors - 1)){ endIndex = groups.size(); } + lines.push_back(linePair(startIndex, endIndex)); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + + //pass numSeqs to parent + ofstream out; + string tempFile = outputFName + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); m->mothurRemove(tempFile); + } + +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the preClusterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; icount; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } +#endif + + + //append output files + for(int i=0;iappendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName); + m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos); + m->mothurRemove((accnos + toString(processIDS[i]) + ".temp")); + } + + return num; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups"); + exit(1); + } +} +//********************************************************************************************************************** +int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){ + try { + map uniqueNames = parser.getAllSeqsMap(); + map::iterator itUnique; + int total = 0; + + //edit accnos file + ifstream in2; + m->openInputFile(accnosFileName, in2); + + ofstream out2; + m->openOutputFile(accnosFileName+".temp", out2); + + string name; + set namesInFile; //this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once + set::iterator itNames; + set chimerasInFile; + set::iterator itChimeras; + + + while (!in2.eof()) { + if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; } + + in2 >> name; m->gobble(in2); + + //find unique name + itUnique = uniqueNames.find(name); + + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { + itChimeras = chimerasInFile.find((itUnique->second)); + + if (itChimeras == chimerasInFile.end()) { + out2 << itUnique->second << endl; + chimerasInFile.insert((itUnique->second)); + total++; + } + } + } + in2.close(); + out2.close(); + + m->mothurRemove(accnosFileName); + rename((accnosFileName+".temp").c_str(), accnosFileName.c_str()); + + //edit chimera file + ifstream in; + m->openInputFile(outputFileName, in); + + ofstream out; + m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent; + float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8; + string index, BestMatchName, parent1, parent2, flag; + name = ""; + namesInFile.clear(); + //assumptions - in file each read will always look like + /* + SequenceIndex Name DiffsToBestMatch BestMatchIndex BestMatchName DiffstToChimera IndexofLeftParent IndexOfRightParent NameOfLeftParent NameOfRightParent DistanceToBestMatch cIndex (cIndex - singleDist) loonIndex MismatchesToChimera MismatchToTrimera ChimeraBreakPoint LogisticProbability TypeOfSequence + 0 F01QG4L02JVBQY 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good + 1 F01QG4L02ICTC6 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good + 2 F01QG4L02JZOEC 48 0 F01QG4L02JVBQY 47 0 0 F01QG4L02JVBQY F01QG4L02JVBQY 2.0449 2.03545 -0.00944493 0 47 2147483647 138 0 good + 3 F01QG4L02G7JEC 42 0 F01QG4L02JVBQY 40 1 0 F01QG4L02ICTC6 F01QG4L02JVBQY 1.87477 1.81113 -0.0636404 5.80145 40 2147483647 25 0 good + */ + + //get and print headers + BestMatchName = m->getline(in); m->gobble(in); + out << BestMatchName << endl; + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove((outputFileName+".temp")); return 0; } + + bool print = false; + in >> index; m->gobble(in); + + if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given + in >> name; m->gobble(in); + in >> DiffsToBestMatch; m->gobble(in); + in >> BestMatchIndex; m->gobble(in); + in >> BestMatchName; m->gobble(in); + in >> DiffstToChimera; m->gobble(in); + in >> IndexofLeftParent; m->gobble(in); + in >> IndexOfRightParent; m->gobble(in); + in >> parent1; m->gobble(in); + in >> parent2; m->gobble(in); + in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; m->gobble(in); + + //find unique name + itUnique = uniqueNames.find(name); + + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { + name = itUnique->second; + //is this name already in the file + itNames = namesInFile.find((name)); + + if (itNames == namesInFile.end()) { //no not in file + if (flag == "good") { //are you really a no?? + //is this sequence really not chimeric?? + itChimeras = chimerasInFile.find(name); + + //then you really are a no so print, otherwise skip + if (itChimeras == chimerasInFile.end()) { print = true; } + }else{ print = true; } + } + } + + if (print) { + out << index << '\t' << name << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t'; + namesInFile.insert(name); + + if (BestMatchName != "Null") { + itUnique = uniqueNames.find(BestMatchName); + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find BestMatchName "+ BestMatchName + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { out << itUnique->second << '\t'; } + }else { out << "Null" << '\t'; } + + out << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t'; + + if (parent1 != "Null") { + itUnique = uniqueNames.find(parent1); + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent1 "+ parent1 + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { out << itUnique->second << '\t'; } + }else { out << "Null" << '\t'; } + + if (parent1 != "Null") { + itUnique = uniqueNames.find(parent2); + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent2 "+ parent2 + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { out << itUnique->second << '\t'; } + }else { out << "Null" << '\t'; } + + out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl; + } + }else { index = m->getline(in); m->gobble(in); } + } + in.close(); + out.close(); + + m->mothurRemove(outputFileName); + rename((outputFileName+".temp").c_str(), outputFileName.c_str()); + + return total; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults"); + exit(1); + } +} +//********************************************************************************************************************** + + diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h new file mode 100644 index 0000000..84563ac --- /dev/null +++ b/chimeraperseuscommand.h @@ -0,0 +1,325 @@ +#ifndef CHIMERAPERSEUSCOMMAND_H +#define CHIMERAPERSEUSCOMMAND_H + + +/* + * chimeraperseuscommand.h + * Mothur + * + * Created by westcott on 10/26/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + + +#include "mothur.h" +#include "command.hpp" +#include "sequenceparser.h" +#include "myPerseus.h" + +/***********************************************************/ +class ChimeraPerseusCommand : public Command { +public: + ChimeraPerseusCommand(string); + ChimeraPerseusCommand(); + ~ChimeraPerseusCommand() {} + + vector setParameters(); + string getCommandName() { return "chimera.perseus"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Chimera.perseus\n"; } + string getDescription() { return "detect chimeric sequences"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + bool abort; + string fastafile, groupfile, outputDir, namefile; + int processors; + double cutoff, alpha, beta; + + vector outputNames; + vector fastaFileNames; + vector nameFileNames; + vector groupFileNames; + + string getNamesFile(string&); + int driver(string, vector&, string, int&); + vector readFiles(string, string); + vector loadSequences(SequenceParser&, string); + int deconvoluteResults(SequenceParser&, string, string); + int driverGroups(SequenceParser&, string, string, int, int, vector); + int createProcessesGroups(SequenceParser&, string, string, vector, string, string, string); +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct perseusData { + string fastafile; + string namefile; + string groupfile; + string outputFName; + string accnos; + MothurOut* m; + int start; + int end; + int threadID, count, numChimeras; + double alpha, beta, cutoff; + vector groups; + + perseusData(){} + perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + alpha = a; + beta = b; + cutoff = c; + fastafile = f; + namefile = n; + groupfile = g; + outputFName = o; + accnos = ac; + m = mout; + start = st; + end = en; + threadID = tid; + groups = gr; + count = 0; + numChimeras = 0; + } +}; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ + perseusData* pDataArray; + pDataArray = (perseusData*)lpParam; + + try { + + //clears files + ofstream out, out1, out2; + pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close(); + pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close(); + + //parse fasta and name file by group + SequenceParser* parser; + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + + int totalSeqs = 0; + int numChimeras = 0; + + for (int i = pDataArray->start; i < pDataArray->end; i++) { + + int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); + + //vector sequences = loadSequences(parser, groups[i]); - same function below + //////////////////////////////////////////////////////////////////////////////////////// + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); + map nameMap = parser->getNameMap(pDataArray->groups[i]); + map::iterator it; + + vector sequences; + bool error = false; + + for (int j = 0; j < thisGroupsSeqs.size(); j++) { + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + it = nameMap.find(thisGroupsSeqs[j].getName()); + if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + int num = pDataArray->m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num)); + } + } + + if (error) { pDataArray->m->control_pressed = true; } + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + //////////////////////////////////////////////////////////////////////////////////////// + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below + //////////////////////////////////////////////////////////////////////////////////////// + string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i]; + string accnosFileName = pDataArray->accnos+pDataArray->groups[i]; + + vector > correctModel(4); //could be an option in the future to input own model matrix + for(int j=0;j<4;j++){ correctModel[j].resize(4); } + + correctModel[0][0] = 0.000000; //AA + correctModel[1][0] = 11.619259; //CA + correctModel[2][0] = 11.694004; //TA + correctModel[3][0] = 7.748623; //GA + + correctModel[1][1] = 0.000000; //CC + correctModel[2][1] = 7.619657; //TC + correctModel[3][1] = 12.852562; //GC + + correctModel[2][2] = 0.000000; //TT + correctModel[3][2] = 10.964048; //TG + + correctModel[3][3] = 0.000000; //GG + + for(int k=0;k<4;k++){ + for(int j=0;jm->openOutputFile(chimeraFileName, chimeraFile); + pDataArray->m->openOutputFile(accnosFileName, accnosFile); + + Perseus myPerseus; + vector > binMatrix = myPerseus.binomial(alignLength); + + chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n"; + + vector chimeras(numSeqs, 0); + + for(int j=0;jm->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + vector restricted = chimeras; + + vector > leftDiffs(numSeqs); + vector > leftMaps(numSeqs); + vector > rightDiffs(numSeqs); + vector > rightMaps(numSeqs); + + vector singleLeft, bestLeft; + vector singleRight, bestRight; + + int bestSingleIndex, bestSingleDiff; + vector alignments(numSeqs); + + int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; + + string dummyA, dummyB; + + if(comparisons >= 2){ + minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToTrimera = numeric_limits::max(); + int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; + + if(minMismatchToChimera >= 3 && comparisons >= 3){ + minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + } + + double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + string type; + string chimeraRefSeq; + + if(minMismatchToChimera - minMismatchToTrimera >= 3){ + type = "trimera"; + chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps); + } + else{ + type = "chimera"; + chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); + } + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq); + double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; + chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; + chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t'; + chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t'; + + double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta); + + chimeraFile << probability << '\t'; + + if(probability > pDataArray->cutoff){ + chimeraFile << type << endl; + accnosFile << sequences[j].seqName << endl; + chimeras[j] = 1; + numChimeras++; + } + else{ + chimeraFile << "good" << endl; + } + + } + else{ + chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; + } + //report progress + if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); } + } + + if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); } + + chimeraFile.close(); + accnosFile.close(); + //////////////////////////////////////////////////////////////////////////////////////// + + totalSeqs += numSeqs; + + //append files + pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName); + pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + } + + pDataArray->count = totalSeqs; + delete parser; + return totalSeqs; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ + +#endif + +#endif + + diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index e08909b..ee7add9 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -1514,7 +1514,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o #else ////////////////////////////////////////////////////////////////////////////////////////////////////// - //Windows version shared memory, so be careful when passing variables through the preClusterData struct. + //Windows version shared memory, so be careful when passing variables through the uchimeData struct. //Above fork() will clone, so memory is separate, but that's not the case with windows, ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1534,7 +1534,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o pDataArray.push_back(tempUchime); processIDS.push_back(i); - //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //MyUchimeThreadFunction is in header. It must be global or static to work with the threads. //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier hThreadArray[i-1] = CreateThread(NULL, 0, MyUchimeThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); } diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index 7b1a04f..f36631a 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -368,7 +368,7 @@ int ClassifyOtuCommand::readTaxonomyFile() { m->gobble(in); //are there confidence scores, if so remove them - if (tax.find_first_of('(') != -1) { removeConfidences(tax); } + if (tax.find_first_of('(') != -1) { m->removeConfidences(tax); } taxMap[name] = tax; @@ -550,7 +550,7 @@ int ClassifyOtuCommand::process(ListVector* processList) { out << (i+1) << '\t' << size << '\t' << conTax << endl; string noConfidenceConTax = conTax; - removeConfidences(noConfidenceConTax); + m->removeConfidences(noConfidenceConTax); //add this bins taxonomy to summary if (basis == "sequence") { @@ -604,36 +604,6 @@ string ClassifyOtuCommand::addUnclassifieds(string tax, int maxlevel) { exit(1); } } - -/**************************************************************************************************/ -void ClassifyOtuCommand::removeConfidences(string& tax) { - try { - - string taxon; - string newTax = ""; - - while (tax.find_first_of(';') != -1) { - //get taxon - taxon = tax.substr(0,tax.find_first_of(';')); - - int pos = taxon.find_first_of('('); - if (pos != -1) { - taxon = taxon.substr(0, pos); //rip off confidence - } - - taxon += ";"; - - tax = tax.substr(tax.find_first_of(';')+1, tax.length()); - newTax += taxon; - } - - tax = newTax; - } - catch(exception& e) { - m->errorOut(e, "ClassifyOtuCommand", "removeConfidences"); - exit(1); - } -} //********************************************************************************************************************** diff --git a/classifyotucommand.h b/classifyotucommand.h index d1c1734..a0baf18 100644 --- a/classifyotucommand.h +++ b/classifyotucommand.h @@ -46,7 +46,6 @@ private: int readNamesFile(); int readTaxonomyFile(); - void removeConfidences(string&); int process(ListVector*); string addUnclassifieds(string, int); vector findConsensusTaxonomy(int, ListVector*, int&, string&); // returns the name of the "representative" taxonomy of given bin diff --git a/commandfactory.cpp b/commandfactory.cpp index 2fc6875..69bb568 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -123,6 +123,7 @@ #include "countgroupscommand.h" #include "clearmemorycommand.h" #include "summarytaxcommand.h" +#include "chimeraperseuscommand.h" /*******************************************************/ @@ -258,6 +259,7 @@ CommandFactory::CommandFactory(){ commands["chimera.check"] = "MPIEnabled"; commands["chimera.slayer"] = "MPIEnabled"; commands["chimera.uchime"] = "chimera.uchime"; + commands["chimera.perseus"] = "chimera.perseus"; commands["chimera.pintail"] = "MPIEnabled"; commands["chimera.bellerophon"] = "MPIEnabled"; commands["screen.seqs"] = "MPIEnabled"; @@ -267,6 +269,7 @@ CommandFactory::CommandFactory(){ commands["sens.spec"] = "sens.spec"; commands["seq.error"] = "seq.error"; commands["seq.error"] = "summary.tax"; + commands["quit"] = "MPIEnabled"; } @@ -424,6 +427,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } + else if(commandName == "chimera.perseus") { command = new ChimeraPerseusCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -565,6 +569,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } + else if(commandName == "chimera.perseus") { pipecommand = new ChimeraPerseusCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -694,6 +699,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } + else if(commandName == "chimera.perseus") { shellcommand = new ChimeraPerseusCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index 195c822..b4e41e5 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -564,7 +564,8 @@ int GetLineageCommand::readTax(){ if (hasConPos != string::npos) { taxonsHasConfidence[i] = true; searchTaxons[i] = getTaxons(listOfTaxons[i]); - noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]); + noConfidenceTaxons[i] = listOfTaxons[i]; + m->removeConfidences(noConfidenceTaxons[i]); } } @@ -584,7 +585,8 @@ int GetLineageCommand::readTax(){ if (!taxonsHasConfidence[j]) { int hasConfidences = tax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = removeConfidences(tax); + newtax = tax; + m->removeConfidences(newtax); } int pos = newtax.find(noConfidenceTaxons[j]); @@ -613,7 +615,8 @@ int GetLineageCommand::readTax(){ string noNewTax = tax; int hasConfidences = tax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = removeConfidences(tax); + noNewTax = tax; + m->removeConfidences(noNewTax); } int pos = noNewTax.find(noConfidenceTaxons[j]); @@ -725,29 +728,6 @@ vector< map > GetLineageCommand::getTaxons(string tax) { exit(1); } } -/**************************************************************************************************/ -string GetLineageCommand::removeConfidences(string tax) { - try { - - string taxon = ""; - int taxLength = tax.length(); - for(int i=0;ierrorOut(e, "GetLineageCommand", "removeConfidences"); - exit(1); - } -} //********************************************************************************************************************** //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name int GetLineageCommand::readAlign(){ diff --git a/getlineagecommand.h b/getlineagecommand.h index 4c9514d..a677061 100644 --- a/getlineagecommand.h +++ b/getlineagecommand.h @@ -44,7 +44,6 @@ class GetLineageCommand : public Command { int readAlign(); int readList(); int readTax(); - string removeConfidences(string); vector< map > getTaxons(string); }; diff --git a/metastatscommand.cpp b/metastatscommand.cpp index b493c96..975924b 100644 --- a/metastatscommand.cpp +++ b/metastatscommand.cpp @@ -394,7 +394,7 @@ int MetaStatsCommand::driver(int start, int num, vector& th //get set names string setA = namesOfGroupCombos[c][0]; string setB = namesOfGroupCombos[c][1]; - + cout << setA << '\t' << setB << endl; //get filename string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats"; outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName); @@ -425,7 +425,8 @@ int MetaStatsCommand::driver(int start, int num, vector& th } } - //for (int i = 0; i < subset.size(); i++) { cout << subset[i]->getGroup() << endl; } + for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; } + cout << setACount << endl; if ((setACount == 0) || (setBCount == 0)) { m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine(); diff --git a/mothurmetastats.cpp b/mothurmetastats.cpp index 6a834e9..9ba4050 100644 --- a/mothurmetastats.cpp +++ b/mothurmetastats.cpp @@ -55,10 +55,10 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector } //total for first grouping - for (int i = 0; i < (secondGroupingStart-1); i++) { total1 += total[i]; } + for (int i = 0; i < secondGroupingStart; i++) { total1 += total[i]; } //total for second grouping - for (int i = (secondGroupingStart-1); i < column; i++) { total2 += total[i]; } + for (int i = secondGroupingStart; i < column; i++) { total2 += total[i]; } //Creates the ratios by first finding the minimum of totals double min = total[0]; @@ -105,15 +105,15 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector if (m->control_pressed) { return 1; } // Start the calculations. - if ( (column == 2) || ((secondGroupingStart-1) < 8) || ((column-secondGroupingStart+1) < 8) ){ + if ( (column == 2) || (secondGroupingStart < 8) || ((column-secondGroupingStart) < 8) ){ vector fish; fish.resize(row, 0.0); vector fish2; fish2.resize(row, 0.0); for(int i = 0; i < row; i++){ - for(int j = 0; j < (secondGroupingStart-1); j++) { fish[i] += data[i][j]; } - for(int j = (secondGroupingStart-1); j < column; j++) { fish2[i] += data[i][j]; } + for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; } + for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; } //vector tempData; tempData.resize(4, 0.0); double f11, f12, f21, f22; @@ -148,12 +148,12 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector for(int i = 0; i < row; i++){ - for(int j = 0; j < (secondGroupingStart-1); j++){ sparse[i] += data[i][j]; } - if(sparse[i] < (double)(secondGroupingStart-1)){ c++; } + for(int j = 0; j < secondGroupingStart; j++) { sparse[i] += data[i][j]; } + if(sparse[i] < (double)secondGroupingStart) { c++; } // ?<= for col - for(int j = (secondGroupingStart-1); j < column; j++){ sparse2[i] += data[i][j]; } - if( (sparse2[i] < (double)(column-secondGroupingStart+1))) { c++; } + for(int j = secondGroupingStart; j < column; j++) { sparse2[i] += data[i][j]; } + if( (sparse2[i] < (double)(column-secondGroupingStart))) { c++; } if (c == 2) { c=0; @@ -190,11 +190,11 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector for (int j = 0; j < row; j++){ if (m->control_pressed) { return 1; } - for (int i = 1; i <= (secondGroupingStart-1); i++){ temp[j][0] += data[j][i-1]; } - temp[j][0] /= (double)(secondGroupingStart-1); + for (int i = 0; i < secondGroupingStart; i++){ temp[j][0] += data[j][i]; } + temp[j][0] /= (double)secondGroupingStart; - for(int i = secondGroupingStart; i <= column; i++){ temp[j][1] += data[j][i-1]; } - temp[j][1] /= (double)(column-secondGroupingStart+1); + for(int i = secondGroupingStart; i < column; i++){ temp[j][1] += data[j][i]; } + temp[j][1] /= (double)(column-secondGroupingStart); } for(int i = 0; i < row; i++){ @@ -280,14 +280,14 @@ int MothurMetastats::start(vector& Imatrix, int secondGroupingStart, vec storage[i][0]=C1[i][0]; C1[i][1]=tool[i+row+row]; // var group 1 storage[i][1]=C1[i][1]; - C1[i][2]=C1[i][1]/(secondGroupingStart-1); + C1[i][2]=C1[i][1]/(secondGroupingStart); storage[i][2]=sqrt(C1[i][2]); C2[i][0]=tool[i+row]; // mean group 2 storage[i][4]=C2[i][0]; C2[i][1]=tool[i+row+row+row]; // var group 2 storage[i][5]=C2[i][1]; - C2[i][2]=C2[i][1]/(column-secondGroupingStart+1); + C2[i][2]=C2[i][1]/(column-secondGroupingStart); storage[i][6]=sqrt(C2[i][2]); } @@ -314,7 +314,7 @@ int MothurMetastats::meanvar(vector& pmatrix, int secondGroupingStart, v vector var; var.resize(row, 0.0); vector var2; var2.resize(row, 0.0); - double a = secondGroupingStart-1; + double a = secondGroupingStart; double b = column - a; int m = a * row; int n = row * column; @@ -459,11 +459,11 @@ int MothurMetastats::calc_twosample_ts(vector& Pmatrix, int secondGroupi if (m->control_pressed) { return 0; } C1[i][0]=tool[i]; C1[i][1]=tool[i+row+row]; - C1[i][2]=C1[i][1]/(secondGroupingStart-1); + C1[i][2]=C1[i][1]/(secondGroupingStart); C2[i][0]=tool[i+row]; C2[i][1]=tool[i+row+row+row]; // var group 2 - C2[i][2]=C2[i][1]/(column-secondGroupingStart+1); + C2[i][2]=C2[i][1]/(column-secondGroupingStart); } for (int i = 0; i < row; i++){ diff --git a/mothurout.cpp b/mothurout.cpp index 90d6f37..32f3b85 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2020,6 +2020,46 @@ bool MothurOut::isContainingOnlyDigits(string input) { } } /**************************************************************************************************/ +int MothurOut::removeConfidences(string& tax) { + try { + + string taxon; + string newTax = ""; + + while (tax.find_first_of(';') != -1) { + + if (control_pressed) { return 0; } + + //get taxon + taxon = tax.substr(0,tax.find_first_of(';')); + + int pos = taxon.find_last_of('('); + if (pos != -1) { + //is it a number? + int pos2 = taxon.find_last_of(')'); + if (pos2 != -1) { + string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1))); + if (isContainingOnlyDigits(confidenceScore)) { + taxon = taxon.substr(0, pos); //rip off confidence + } + } + } + taxon += ";"; + + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + newTax += taxon; + } + + tax = newTax; + + return 0; + } + catch(exception& e) { + errorOut(e, "MothurOut", "removeConfidences"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/mothurout.h b/mothurout.h index 0c45a30..785263b 100644 --- a/mothurout.h +++ b/mothurout.h @@ -84,6 +84,7 @@ class MothurOut { int readNames(string, map >&); int readNames(string, vector&, map&); void mothurRemove(string); + //searchs and checks bool checkReleaseVersion(ifstream&, string); @@ -105,6 +106,7 @@ class MothurOut { void splitAtDash(string&, set&); void splitAtDash(string&, vector&); void splitAtChar(string&, vector&, char); + int removeConfidences(string&); //math operation int factorial(int num); diff --git a/myPerseus.cpp b/myPerseus.cpp new file mode 100644 index 0000000..b342393 --- /dev/null +++ b/myPerseus.cpp @@ -0,0 +1,1073 @@ +/* + * myPerseus.cpp + * + * + * Created by Pat Schloss on 9/5/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "myPerseus.h" + +/**************************************************************************************************/ +int PERSEUSMAXINT = numeric_limits::max(); +/**************************************************************************************************/ + +vector > Perseus::binomial(int maxOrder){ + try { + vector > binomial(maxOrder+1); + + for(int i=0;i<=maxOrder;i++){ + binomial[i].resize(maxOrder+1); + binomial[i][0]=1; + binomial[0][i]=0; + } + binomial[0][0]=1; + + binomial[1][0]=1; + binomial[1][1]=1; + + for(int i=2;i<=maxOrder;i++){ + binomial[1][i]=0; + } + + for(int i=2;i<=maxOrder;i++){ + for(int j=1;j<=maxOrder;j++){ + if(i==j){ binomial[i][j]=1; } + if(j>i) { binomial[i][j]=0; } + else { binomial[i][j]=binomial[i-1][j-1]+binomial[i-1][j]; } + } + } + + return binomial; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "binomial"); + exit(1); + } +} + +/**************************************************************************************************/ +double Perseus::basicPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, pwModel model){ + try { + double GAP = model.GAP_OPEN; + double MATCH = model.MATCH; + double MISMATCH = model.MISMATCH; + + int queryLength = query.size(); + int refLength = reference.size(); + + vector > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = GAP * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = GAP * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + + if (m->control_pressed) { return 0; } + + for(int j=1;j<=refLength;j++){ + + double nogapScore; + if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; } + else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; } + + double leftScore; + if(i == queryLength) { leftScore = alignMatrix[i][j-1]; } + else { leftScore = alignMatrix[i][j-1] + GAP; } + + + double upScore; + if(j == refLength) { upScore = alignMatrix[i-1][j]; } + else { upScore = alignMatrix[i-1][j] + GAP; } + + if(nogapScore > leftScore){ + if(nogapScore > upScore){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogapScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + else{ + if(leftScore > upScore){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = leftScore; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = upScore; + } + } + } + } + + int i = queryLength; + int j = refLength; + + qAlign = ""; + rAlign = ""; + + int diffs = 0; + int length = 0; + + while(i > 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + qAlign = query[i-1] + qAlign; + rAlign = reference[j-1] + rAlign; + + if(query[i-1] != reference[j-1]){ diffs++; } + length++; + + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + qAlign = query[i-1] + qAlign; + + if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; } + else { rAlign = '.' + rAlign; } + i--; + } + else if(alignMoves[i][j] == 'l'){ + rAlign = reference[j-1] + rAlign; + + if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; } + else { qAlign = '.' + qAlign; } + j--; + } + } + + while(i>0){ + + if (m->control_pressed) { return 0; } + + rAlign = '.' + rAlign; + qAlign = query[i-1] + qAlign; + i--; + } + + while(j>0){ + + if (m->control_pressed) { return 0; } + + rAlign = reference[j-1] + rAlign; + qAlign = '.' + qAlign; + j--; + } + + + + return double(diffs)/double(length); + } + catch(exception& e) { + m->errorOut(e, "Perseus", "basicPairwiseAlignSeqs"); + exit(1); + } + +} +/**************************************************************************************************/ +int Perseus::getDiffs(string qAlign, string rAlign, vector& leftDiffs, vector& leftMap, vector& rightDiffs, vector& rightMap){ + try { + int alignLength = qAlign.length(); + + int lDiffs = 0; + int lCount = 0; + for(int l=0;lcontrol_pressed) { return 0; } + + if(qAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + lDiffs++; + } + else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){ + lDiffs++; + } + leftDiffs[lCount] = lDiffs; + leftMap[lCount] = l; + + lCount++; + } + } + + int rDiffs = 0; + int rCount = 0; + for(int l=alignLength-1;l>=0;l--){ + + if (m->control_pressed) { return 0; } + + if(qAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != '.'){ + + if(rAlign[l] == '-'){ + rDiffs++; + } + else if(qAlign[l] != rAlign[l] && rAlign[l] != '.'){ + rDiffs++; + } + + rightDiffs[rCount] = rDiffs; + rightMap[rCount] = l; + rCount++; + + } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "getDiffs"); + exit(1); + } +} +/**************************************************************************************************/ +int Perseus::getLastMatch(char direction, vector >& alignMoves, int i, int j, string& seqA, string& seqB){ + try { + char nullReturn = -1; + + while(i>=1 && j>=1){ + + if (m->control_pressed) { return 0; } + + if(direction == 'd'){ + if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; } + else { return nullReturn; } + } + + else if(direction == 'l') { j--; } + else { i--; } + + direction = alignMoves[i][j]; + } + + return nullReturn; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "getLastMatch"); + exit(1); + } +} + +/**************************************************************************************************/ + +int Perseus::toInt(char b){ + try { + if(b == 'A') { return 0; } + else if(b == 'C') { return 1; } + else if(b == 'T') { return 2; } + else if(b == 'G') { return 3; } + else { m->mothurOut("[ERROR]: " + toString(b) + " is not ATGC."); m->mothurOutEndLine(); return -1; } + } + catch(exception& e) { + m->errorOut(e, "Perseus", "toInt"); + exit(1); + } +} + +/**************************************************************************************************/ + +double Perseus::modeledPairwiseAlignSeqs(string query, string reference, string& qAlign, string& rAlign, vector >& correctMatrix){ + try { + int queryLength = query.size(); + int refLength = reference.size(); + + vector > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = 15.0 * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = 15.0 * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + + if (m->control_pressed) { return 0; } + + for(int j=1;j<=refLength;j++){ + + double nogap; + nogap = alignMatrix[i-1][j-1] + correctMatrix[toInt(query[i-1])][toInt(reference[j-1])]; + + double gap; + + double left; + if(i == queryLength){ //terminal gap + left = alignMatrix[i][j-1]; + } + else{ + if(reference[j-1] == getLastMatch('l', alignMoves, i, j, query, reference)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + left = alignMatrix[i][j-1] + gap; + } + + double up; + if(j == refLength){ //terminal gap + up = alignMatrix[i-1][j]; + } + else{ + + if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, reference)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + up = alignMatrix[i-1][j] + gap; + } + + + if(nogap < left){ + if(nogap < up){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogap; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + else{ + if(left < up){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = left; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + } + } + + int i = queryLength; + int j = refLength; + + int alignLength = 0; + + while(i > 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + qAlign = query[i-1] + qAlign; + rAlign = reference[j-1] + rAlign; + alignLength++; + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + if(j != refLength){ + qAlign = query[i-1] + qAlign; + rAlign = '-' + rAlign; + alignLength++; + } + + i--; + } + else if(alignMoves[i][j] == 'l'){ + if(i != queryLength){ + qAlign = '-' + qAlign; + rAlign = reference[j-1] + rAlign; + alignLength++; + } + + j--; + } + } + + return alignMatrix[queryLength][refLength] / (double)alignLength; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "modeledPairwiseAlignSeqs"); + exit(1); + } +} + +/**************************************************************************************************/ +int Perseus::getAlignments(int curSequenceIndex, vector sequences, vector& alignments, vector >& leftDiffs, vector >& leftMaps, vector >& rightDiffs, vector >& rightMaps, int& bestRefSeq, int& bestRefDiff, vector& restricted){ + try { + int numSeqs = sequences.size(); + //int bestSequenceMismatch = PERSEUSMAXINT; + + string curSequence = sequences[curSequenceIndex].sequence; + int curFrequency = sequences[curSequenceIndex].frequency; + + bestRefSeq = -1; + + int bestIndex = -1; + int bestDiffs = PERSEUSMAXINT; + int comparisons = 0; + + pwModel model(0, -1, -1.5); + + for(int i=0;icontrol_pressed) { return 0; } + + if(i != curSequenceIndex && restricted[i] != 1 && sequences[i].frequency >= 2 * curFrequency){ + string refSequence = sequences[i].sequence; + + leftDiffs[i].assign(curSequence.length(), 0); + leftMaps[i].assign(curSequence.length(), 0); + rightDiffs[i].assign(curSequence.length(), 0); + rightMaps[i].assign(curSequence.length(), 0); + + basicPairwiseAlignSeqs(curSequence, refSequence, alignments[i].query, alignments[i].reference, model); + + + getDiffs(alignments[i].query, alignments[i].reference, leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]); + + int diffs = rightDiffs[i][curSequence.length()-1]; + + if(diffs < bestDiffs){ + bestDiffs = diffs; + bestIndex = i; + } + comparisons++; + restricted[i] = 0; + } + else{ + restricted[i] = 1; + } + } + + bestRefSeq = bestIndex; + bestRefDiff = bestDiffs; + + return comparisons; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "getAlignments"); + exit(1); + } +} +/**************************************************************************************************/ +int Perseus::getChimera(vector sequences, + vector >& leftDiffs, + vector >& rightDiffs, + int& leftParent, + int& rightParent, + int& breakPoint, + vector& singleLeft, + vector& bestLeft, + vector& singleRight, + vector& bestRight, + vector restricted){ + try { + int numRefSeqs = restricted.size(); + int seqLength = leftDiffs[0].size(); + + singleLeft.resize(seqLength, PERSEUSMAXINT); + bestLeft.resize(seqLength, -1); + + for(int l=0;lcontrol_pressed) { return 0; } + + for(int i=0;i sequences[bestLeft[l]].frequency)){ + singleLeft[l] = leftDiffs[i][l]; + bestLeft[l] = i; + } + } + } + } + + singleRight.resize(seqLength, PERSEUSMAXINT); + bestRight.resize(seqLength, -1); + + for(int l=0;lcontrol_pressed) { return 0; } + + for(int i=0;i sequences[bestRight[l]].frequency)){ + singleRight[l] = rightDiffs[i][l]; + bestRight[l] = i; + } + } + } + } + + + + int bestChimeraMismatches = PERSEUSMAXINT; + leftParent = -1; + rightParent = -1; + breakPoint = -1; + + for(int l=0;lcontrol_pressed) { return 0; } + + int chimera = singleLeft[l] + singleRight[seqLength - l - 2]; + if(chimera < bestChimeraMismatches){ + bestChimeraMismatches = chimera; + breakPoint = l; + leftParent = bestLeft[l]; + rightParent = bestRight[seqLength - l - 2]; + } + } + + return bestChimeraMismatches; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "getChimera"); + exit(1); + } +} + +/**************************************************************************************************/ + +string Perseus::stitchBimera(vector& alignments, int leftParent, int rightParent, int breakPoint, vector >& leftMaps, vector >& rightMaps){ + try { + int breakLeft = leftMaps[leftParent][breakPoint]; + int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2]; + + string left = alignments[leftParent].reference; + string right = alignments[rightParent].reference; + string chimera = ""; + + for(int i=0;i<=breakLeft;i++){ + + if (m->control_pressed) { return 0; } + + if(left[i] != '-' && left[i] != '.'){ + chimera += left[i]; + } + } + + + for(int i=breakRight;icontrol_pressed) { return 0; } + + if(right[i] != '-' && right[i] != '.'){ + chimera += right[i]; + } + } + + return chimera; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "stitchBimera"); + exit(1); + } +} +/**************************************************************************************************/ +int Perseus::getTrimera(vector& sequences, + vector >& leftDiffs, + int& leftParent, + int& middleParent, + int& rightParent, + int& breakPointA, + int& breakPointB, + vector& singleLeft, + vector& bestLeft, + vector& singleRight, + vector& bestRight, + vector restricted){ + try { + int numRefSeqs = leftDiffs.size(); + int alignLength = leftDiffs[0].size(); + int bestTrimeraMismatches = PERSEUSMAXINT; + + leftParent = -1; + middleParent = -1; + rightParent = -1; + + breakPointA = -1; + breakPointB = -1; + + vector > minDelta(alignLength); + vector > minDeltaSeq(alignLength); + + for(int i=0;icontrol_pressed) { return 0; } + minDelta[i].assign(alignLength, PERSEUSMAXINT); + minDeltaSeq[i].assign(alignLength, -1); + } + + for(int x=0;xcontrol_pressed) { return 0; } + + if(restricted[i] == 0){ + int delta = leftDiffs[i][y] - leftDiffs[i][x]; + + if(delta < minDelta[x][y] || delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency){ + minDelta[x][y] = delta; + minDeltaSeq[x][y] = i; + } + } + } + minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2]; + + if(minDelta[x][y] < bestTrimeraMismatches){ + bestTrimeraMismatches = minDelta[x][y]; + + breakPointA = x; + breakPointB = y; + + leftParent = bestLeft[x]; + middleParent = minDeltaSeq[x][y]; + rightParent = bestRight[alignLength - y - 2]; + } + } + } + + return bestTrimeraMismatches; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "getTrimera"); + exit(1); + } +} + +/**************************************************************************************************/ + +string Perseus::stitchTrimera(vector alignments, int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB, vector >& leftMaps, vector >& rightMaps){ + try { + int p1SplitPoint = leftMaps[leftParent][breakPointA]; + int p2SplitPoint = leftMaps[middleParent][breakPointB]; + int p3SplitPoint = rightMaps[rightParent][rightMaps[rightParent].size() - breakPointB - 2]; + + string chimeraRefSeq; + for(int i=0;i<=p1SplitPoint;i++){ + if (m->control_pressed) { return chimeraRefSeq; } + if(alignments[leftParent].reference[i] != '-' && alignments[leftParent].reference[i] != '.'){ + chimeraRefSeq += alignments[leftParent].reference[i]; + } + } + + for(int i=p1SplitPoint+1;i<=p2SplitPoint;i++){ + if (m->control_pressed) { return chimeraRefSeq; } + if(alignments[middleParent].reference[i] != '-' && alignments[middleParent].reference[i] != '.'){ + chimeraRefSeq += alignments[middleParent].reference[i]; + } + } + + for(int i=p3SplitPoint;icontrol_pressed) { return chimeraRefSeq; } + if(alignments[rightParent].reference[i] != '-' && alignments[rightParent].reference[i] != '.'){ + chimeraRefSeq += alignments[rightParent].reference[i]; + } + } + + return chimeraRefSeq; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "stitchTrimera"); + exit(1); + } +} + +/**************************************************************************************************/ + +int Perseus::threeWayAlign(string query, string parent1, string parent2, string& qAlign, string& aAlign, string& bAlign){ + try { + pwModel model(1.0, -1.0, -5.0); + + string qL, rL; + string qR, rR; + + basicPairwiseAlignSeqs(query, parent1, qL, rL, model); + basicPairwiseAlignSeqs(query, parent2, qR, rR, model); + + int lLength = qL.length(); + int rLength = qR.length(); + + string qLNew, rLNew; + string qRNew, rRNew; + + int lIndex = 0; + int rIndex = 0; + + while(lIndexcontrol_pressed) { return 0; } + + if(qL[lIndex] == qR[rIndex]){ + qLNew += qL[lIndex]; + rLNew += rL[lIndex]; + lIndex++; + + qRNew += qR[rIndex]; + rRNew += rR[rIndex]; + rIndex++; + } + else if(qL[lIndex] == '-' || qL[lIndex] == '.'){ + //insert a gap into the right sequences + qLNew += qL[lIndex]; + rLNew += rL[lIndex]; + lIndex++; + + if(rIndex != rLength){ + qRNew += '-'; + rRNew += '-'; + } + else{ + qRNew += '.'; + rRNew += '.'; + } + } + else if(qR[rIndex] == '-' || qR[rIndex] == '.'){ + //insert a gap into the left sequences + qRNew += qR[rIndex]; + rRNew += rR[rIndex]; + rIndex++; + + + if(lIndex != lLength){ + qLNew += '-'; + rLNew += '-'; + } + else{ + qLNew += '.'; + rLNew += '.'; + } + + } + } + + qAlign = qLNew; + aAlign = rLNew; + bAlign = rRNew; + + bool qStart = 0; + bool aStart = 0; + bool bStart = 0; + + for(int i=0;icontrol_pressed) { return 0; } + + if(qStart == 0){ + if(qAlign[i] == '-') { qAlign[i] = '.'; } + else { qStart = 1; } + } + if(aStart == 0){ + if(aAlign[i] == '-') { aAlign[i] = '.'; } + else { aStart = 1; } + } + if(bStart == 0){ + if(bAlign[i] == '-') { bAlign[i] = '.'; } + else { bStart = 1; } + } + if(aStart == 1 && bStart == 1 && qStart == 1){ + break; + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "threeWayAlign"); + exit(1); + } +} + +/**************************************************************************************************/ + +double Perseus::calcLoonIndex(string query, string parent1, string parent2, int breakPoint, vector >& binMatrix){ + try { + string queryAln, leftParentAln, rightParentAln; + threeWayAlign(query, parent1, parent2, queryAln, leftParentAln, rightParentAln); + + int alignLength = queryAln.length(); + + int endPos = alignLength; + for(int i=alignLength-1;i>=0; i--){ + if(queryAln[i] != '.' && leftParentAln[i] != '.' && rightParentAln[i] != '.'){ + endPos = i + 1; + break; + } + } + + int diffToLeftCount = 0; + vector diffToLeftMap(alignLength, 0); + + int diffToRightCount = 0; + vector diffToRightMap(alignLength, 0); + + for(int i=0;icontrol_pressed) { return 0; } + + if(queryAln[i] != leftParentAln[i]){ + diffToLeftMap[diffToLeftCount] = i; + diffToLeftCount++; + } + + if(queryAln[i] != rightParentAln[i]){ + diffToRightMap[diffToRightCount] = i; + diffToRightCount++; + } + } + + + + diffToLeftMap[diffToLeftCount] = endPos; + diffToRightMap[diffToRightCount] = endPos; + + int indexL = 0; + int indexR = 0; + int indexS = 0; + + vector diffs; + vector splits; + + splits.push_back(-1); + diffs.push_back(diffToRightCount); + indexS++; + + while(indexL < diffToLeftCount || indexR < diffToRightCount){ + + if (m->control_pressed) { return 0; } + + if(diffToLeftMap[indexL] <= diffToRightMap[indexR]){ + diffs.push_back(diffs[indexS - 1] + 1); + splits.push_back(diffToLeftMap[indexL]); + + indexL++; + indexS++; + } + else if(diffToLeftMap[indexL] > diffToRightMap[indexR]) { + diffs.push_back(diffs[indexS - 1] - 1); + splits.push_back(diffToRightMap[indexR]); + + indexR++; + indexS++; + } + } + + int minDiff = PERSEUSMAXINT; + int minIndex = -1; + for(int i=0;icontrol_pressed) { return 0; } + + if(diffs[i] < minDiff){ + minDiff = diffs[i]; + minIndex = i; + } + } + + int splitPos = endPos; + if(minIndex < indexS - 1){ + splitPos = (splits[minIndex]+splits[minIndex+1]) / 2; + } + + int diffToChimera = 0; + int leftDiffToP1 = 0; + int rightDiffToP1 = 0; + int leftDiffToP2 = 0; + int rightDiffToP2 = 0; + + for(int i=0;icontrol_pressed) { return 0; } + + char bQuery = queryAln[i]; + char bP1 = leftParentAln[i]; + char bP2 = rightParentAln[i]; + + char bConsensus = bQuery; + if(bP1 == bP2){ bConsensus = bP1; } + + if(bConsensus != bQuery){ + diffToChimera++; + } + + if(bConsensus != bP1){ + if(i <= splitPos){ + leftDiffToP1++; + } + else{ + rightDiffToP1++; + } + } + if(bConsensus != bP2){ + if(i <= splitPos){ + leftDiffToP2++; + } + else{ + rightDiffToP2++; + } + } + } + + + int diffToClosestParent, diffToFurtherParent; + int xA, xB, yA, yB; + double aFraction, bFraction; + + if(diffToLeftCount <= diffToRightCount){ //if parent 1 is closer + + diffToClosestParent = leftDiffToP1 + rightDiffToP1; + xA = leftDiffToP1; + xB = rightDiffToP1; + + diffToFurtherParent = leftDiffToP2 + rightDiffToP2; + yA = leftDiffToP2; + yB = rightDiffToP2; + + aFraction = double(splitPos + 1)/(double) endPos; + bFraction = 1 - aFraction; + + } + else{ //if parent 2 is closer + + diffToClosestParent = leftDiffToP2 + rightDiffToP2; + xA = rightDiffToP2; + xB = leftDiffToP2; + + diffToFurtherParent = leftDiffToP1 + rightDiffToP1; + yA = rightDiffToP1; + yB = leftDiffToP1; + + bFraction = double(splitPos + 1)/(double) endPos; + aFraction = 1 - bFraction; + + } + + double loonIndex = 0; + + int totalDifference = diffToClosestParent + diffToChimera; + + if(totalDifference > 0){ + double prob = 0; + + for(int i=diffToClosestParent;i<=totalDifference;i++){ + prob += binMatrix[totalDifference][i] * pow(0.50, i) * pow(0.50, totalDifference - i); + } + loonIndex += -log(prob); + } + + if(diffToFurtherParent > 0){ + double prob = 0; + + for(int i=yA;i<=diffToFurtherParent;i++){ + prob += binMatrix[diffToFurtherParent][i] * pow(aFraction, i) * pow(1-aFraction, diffToFurtherParent - i); + } + loonIndex += -log(prob); + } + + if(diffToClosestParent > 0){ + double prob = 0; + + for(int i=xB;i<=diffToClosestParent;i++){ + prob += binMatrix[diffToClosestParent][i] * pow(bFraction, i) * pow(1-bFraction, diffToClosestParent - i); + } + loonIndex += -log(prob); + } + + return loonIndex; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "calcLoonIndex"); + exit(1); + } +} + +/**************************************************************************************************/ + +double Perseus::calcBestDistance(string query, string reference){ + try { + int alignLength = query.length(); + int mismatch = 0; + int counter = 0; + + for(int i=0;icontrol_pressed) { return 0; } + + if((query[i] != '.' || reference[i] != '.') && (query[i] != '-' && reference[i] != '-')){ + if(query[i] != reference[i]){ mismatch++; } + counter++; + } + } + + return (double)mismatch / (double)counter; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "calcBestDistance"); + exit(1); + } +} + +/**************************************************************************************************/ + +double Perseus::classifyChimera(double singleDist, double cIndex, double loonIndex, double alpha, double beta){ + try { + double difference = cIndex - singleDist; //y + double probability; + + if(cIndex >= 0.15 || difference > 0.00){ + probability = 0.0000; + } + else{ + probability = 1.0 / (1.0 + exp(-(alpha + beta * loonIndex))); + } + + return probability; + } + catch(exception& e) { + m->errorOut(e, "Perseus", "classifyChimera"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/myPerseus.h b/myPerseus.h new file mode 100644 index 0000000..93ef8ae --- /dev/null +++ b/myPerseus.h @@ -0,0 +1,85 @@ +#ifndef MOTHURPERSEUS +#define MOTHURPERSEUS + +/* + * myPerseus.h + * + * + * Created by Pat Schloss on 9/5/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + + +#include "mothurout.h" + +/**************************************************************************************************/ +struct seqData { + + seqData(string name, string seq, int freq) : seqName(name), sequence(seq), frequency(freq) { } + + bool operator<( seqData const& rhs ) const { + + bool verdict = 0; + + if(frequency < rhs.frequency){ + verdict = 1; + } + else if(frequency == rhs.frequency){ + verdict = (seqName > rhs.seqName); + } + + return verdict; + } + + string seqName; + string sequence; + int frequency; +}; +/**************************************************************************************************/ +struct pwModel { + pwModel(double m, double mm, double g): MATCH(m), MISMATCH(mm), GAP_OPEN(g) {;} + double MATCH; + double MISMATCH; + double GAP_OPEN; +}; +/**************************************************************************************************/ +struct pwAlign { + pwAlign(): query(""), reference(""){} + pwAlign(string q, string r): query(q), reference(r){} + string query; + string reference; + +}; +/**************************************************************************************************/ +class Perseus { + +public: + Perseus() { m = MothurOut::getInstance(); } + ~Perseus() {} + + vector > binomial(int); + double modeledPairwiseAlignSeqs(string, string, string&, string&, vector >&); + int getAlignments(int, vector, vector&, vector >& , vector >&, vector >&, vector >&, int&, int&, vector&); + int getChimera(vector,vector >&, vector >&,int&, int&, int&,vector&, vector&, vector&, vector&, vector); + string stitchBimera(vector&, int, int, int, vector >&, vector >&); + int getTrimera(vector&, vector >&, int&, int&, int&, int&, int&, vector&, vector&, vector&, vector&, vector); + string stitchTrimera(vector, int, int, int, int, int, vector >&, vector >&); + double calcLoonIndex(string, string, string, int, vector >&); + double classifyChimera(double, double, double, double, double); + +private: + MothurOut* m; + int toInt(char); + double basicPairwiseAlignSeqs(string, string, string&, string&, pwModel); + int getDiffs(string, string, vector&, vector&, vector&, vector&); + int getLastMatch(char, vector >&, int, int, string&, string&); + int threeWayAlign(string, string, string, string&, string&, string&); + double calcBestDistance(string, string); + + +}; +/**************************************************************************************************/ +#endif + + diff --git a/phylosummary.cpp b/phylosummary.cpp index b645bac..22e8e75 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -131,7 +131,7 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ int level = 0; //are there confidence scores, if so remove them - if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); } + if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); } while (seqTaxonomy != "") { @@ -229,7 +229,7 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ int level = 0; //are there confidence scores, if so remove them - if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); } + if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); } while (seqTaxonomy != "") { @@ -496,35 +496,6 @@ void PhyloSummary::readTreeStruct(ifstream& in){ } } /**************************************************************************************************/ -void PhyloSummary::removeConfidences(string& tax) { - try { - - string taxon; - string newTax = ""; - - while (tax.find_first_of(';') != -1) { - //get taxon - taxon = tax.substr(0,tax.find_first_of(';')); - - int pos = taxon.find_first_of('('); - if (pos != -1) { - taxon = taxon.substr(0, pos); //rip off confidence - } - - taxon += ";"; - - tax = tax.substr(tax.find_first_of(';')+1, tax.length()); - newTax += taxon; - } - - tax = newTax; - } - catch(exception& e) { - m->errorOut(e, "PhyloSummary", "removeConfidences"); - exit(1); - } -} -/**************************************************************************************************/ diff --git a/phylosummary.h b/phylosummary.h index 3adcc3f..cdec0d0 100644 --- a/phylosummary.h +++ b/phylosummary.h @@ -55,8 +55,6 @@ private: int numSeqs; int maxLevel; MothurOut* m; - - void removeConfidences(string&); }; /**************************************************************************************************/ diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index b72f4b6..f55158f 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -555,7 +555,8 @@ int RemoveLineageCommand::readTax(){ if (hasConPos != string::npos) { taxonsHasConfidence[i] = true; searchTaxons[i] = getTaxons(listOfTaxons[i]); - noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]); + noConfidenceTaxons[i] = listOfTaxons[i]; + m->removeConfidences(noConfidenceTaxons[i]); } } @@ -577,7 +578,8 @@ int RemoveLineageCommand::readTax(){ int hasConfidences = tax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = removeConfidences(tax); + newtax = tax; + m->removeConfidences(newtax); } int pos = newtax.find(noConfidenceTaxons[j]); @@ -609,7 +611,8 @@ int RemoveLineageCommand::readTax(){ string noNewTax = tax; int hasConfidences = tax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = removeConfidences(tax); + noNewTax = tax; + m->removeConfidences(noNewTax); } int pos = noNewTax.find(noConfidenceTaxons[j]); @@ -726,29 +729,6 @@ vector< map > RemoveLineageCommand::getTaxons(string tax) { exit(1); } } -/**************************************************************************************************/ -string RemoveLineageCommand::removeConfidences(string tax) { - try { - - string taxon = ""; - int taxLength = tax.length(); - for(int i=0;ierrorOut(e, "RemoveLineageCommand", "removeConfidences"); - exit(1); - } -} //********************************************************************************************************************** //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name int RemoveLineageCommand::readAlign(){ diff --git a/removelineagecommand.h b/removelineagecommand.h index bda398b..a4ffea9 100644 --- a/removelineagecommand.h +++ b/removelineagecommand.h @@ -42,7 +42,6 @@ class RemoveLineageCommand : public Command { int readAlign(); int readList(); int readTax(); - string removeConfidences(string); vector< map > getTaxons(string); }; diff --git a/taxonomyequalizer.cpp b/taxonomyequalizer.cpp index 2f6fc77..c050726 100644 --- a/taxonomyequalizer.cpp +++ b/taxonomyequalizer.cpp @@ -46,7 +46,7 @@ TaxEqualizer::TaxEqualizer(string tfile, int c, string o) : cutoff(c), outputDir in >> name >> tax; m->gobble(in); - if (containsConfidence) { removeConfidences(tax); } + if (containsConfidence) { m->removeConfidences(tax); } //is this a taxonomy that needs to be extended? if (seqLevels[name] < highestLevel) { @@ -152,29 +152,4 @@ void TaxEqualizer::truncateTaxonomy(string name, string& tax, int desiredLevel) } } /**************************************************************************************************/ -void TaxEqualizer::removeConfidences(string& tax) { - try { - - string taxon; - string newTax = ""; - - while (tax.find_first_of(';') != -1) { - //get taxon - taxon = tax.substr(0,tax.find_first_of(';')); - taxon = taxon.substr(0, taxon.find_first_of('(')); //rip off confidence - taxon += ";"; - - tax = tax.substr(tax.find_first_of(';')+1, tax.length()); - newTax += taxon; - } - - tax = newTax; - } - catch(exception& e) { - m->errorOut(e, "TaxEqualizer", "removeConfidences"); - exit(1); - } -} - -/**************************************************************************************************/ diff --git a/taxonomyequalizer.h b/taxonomyequalizer.h index c1e43f4..191267c 100644 --- a/taxonomyequalizer.h +++ b/taxonomyequalizer.h @@ -39,7 +39,6 @@ private: int getHighestLevel(ifstream&); //scans taxonomy file to find taxonomy with highest level void extendTaxonomy(string, string&, int); //name, taxonomy, desired level void truncateTaxonomy(string, string&, int); //name, taxonomy, desired level - void removeConfidences(string&); //removes the confidence limits on the taxon MothurOut* m; }; -- 2.39.2