From 7a1e4b563011b1fe4d466d64915a3cb960747125 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Mon, 3 Feb 2014 16:26:20 -0500 Subject: [PATCH] added mergesfffiles command. added current directories to get.current commands outputs. modified deunique.seqs output filename creation. --- Mothur.xcodeproj/project.pbxproj | 6 + commandfactory.cpp | 5 + commandfactory.hpp | 1 + deuniqueseqscommand.cpp | 8 +- getcurrentcommand.cpp | 29 +- getcurrentcommand.h | 2 + mergesfffilecommand.cpp | 780 +++++++++++++++++++++++++++++++ mergesfffilecommand.h | 57 +++ mothur.h | 46 ++ pam.cpp | 1 + pam.h | 1 + sffinfocommand.h | 44 -- 12 files changed, 929 insertions(+), 51 deletions(-) create mode 100644 mergesfffilecommand.cpp create mode 100644 mergesfffilecommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index f0a56a3..c604120 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ 219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; }; 219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; }; 483C952E188F0CAD0035E7B7 /* sharedrjsd.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 483C952D188F0CAD0035E7B7 /* sharedrjsd.cpp */; }; + 48E981CF189C38FB0042BE9D /* mergesfffilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48E981CE189C38FB0042BE9D /* mergesfffilecommand.cpp */; }; 7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; }; 834D9D581656D7C400E7FAB9 /* regularizedrandomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 834D9D561656D7C400E7FAB9 /* regularizedrandomforest.cpp */; }; 834D9D5C1656DEC800E7FAB9 /* regularizeddecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 834D9D5A1656DEC700E7FAB9 /* regularizeddecisiontree.cpp */; }; @@ -409,6 +410,8 @@ 219C1DE51559BCF2004209F9 /* getcoremicrobiomecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcoremicrobiomecommand.h; sourceTree = ""; }; 483C952C188F0C960035E7B7 /* sharedrjsd.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sharedrjsd.h; sourceTree = ""; }; 483C952D188F0CAD0035E7B7 /* sharedrjsd.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedrjsd.cpp; sourceTree = ""; }; + 48E981CD189C38E60042BE9D /* mergesfffilecommand.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = mergesfffilecommand.h; sourceTree = ""; }; + 48E981CE189C38FB0042BE9D /* mergesfffilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergesfffilecommand.cpp; sourceTree = ""; }; 7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = ""; }; 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = ""; }; 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = ""; }; @@ -1512,6 +1515,8 @@ A7E9B75312D37EC400DA6239 /* mergefilecommand.cpp */, A71FE12A12EDF72400963CA7 /* mergegroupscommand.h */, A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */, + 48E981CD189C38E60042BE9D /* mergesfffilecommand.h */, + 48E981CE189C38FB0042BE9D /* mergesfffilecommand.cpp */, A799314816CBD0BC0017E888 /* mergetaxsummarycommand.h */, A799314A16CBD0CD0017E888 /* mergetaxsummarycommand.cpp */, A7E9B75812D37EC400DA6239 /* metastatscommand.h */, @@ -2102,6 +2107,7 @@ A7E9B89712D37EC400DA6239 /* chimerabellerophoncommand.cpp in Sources */, A7E9B89812D37EC400DA6239 /* chimeraccodecommand.cpp in Sources */, A7E9B89912D37EC400DA6239 /* chimeracheckcommand.cpp in Sources */, + 48E981CF189C38FB0042BE9D /* mergesfffilecommand.cpp in Sources */, A7E9B89A12D37EC400DA6239 /* chimeracheckrdp.cpp in Sources */, A7E9B89B12D37EC400DA6239 /* chimerapintailcommand.cpp in Sources */, A7E9B89C12D37EC400DA6239 /* chimerarealigner.cpp in Sources */, diff --git a/commandfactory.cpp b/commandfactory.cpp index 5a32ac1..e8e9c27 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -149,6 +149,7 @@ #include "lefsecommand.h" #include "kruskalwalliscommand.h" #include "sracommand.h" +#include "mergesfffilecommand.h" /*******************************************************/ @@ -321,6 +322,7 @@ CommandFactory::CommandFactory(){ commands["lefse"] = "lefse"; commands["kruskal.wallis"] = "kruskal.wallis"; commands["sra"] = "sra"; + commands["merge.sfffiles"] = "merge.sfffiles"; } @@ -549,6 +551,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "lefse") { command = new LefseCommand(optionString); } else if(commandName == "kruskal.wallis") { command = new KruskalWallisCommand(optionString); } else if(commandName == "sra") { command = new SRACommand(optionString); } + else if(commandName == "merge.sfffiles") { command = new MergeSfffilesCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -718,6 +721,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "lefse") { pipecommand = new LefseCommand(optionString); } else if(commandName == "kruskal.wallis") { pipecommand = new KruskalWallisCommand(optionString); } else if(commandName == "sra") { pipecommand = new SRACommand(optionString); } + else if(commandName == "merge.sfffiles") { pipecommand = new MergeSfffilesCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -873,6 +877,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "lefse") { shellcommand = new LefseCommand(); } else if(commandName == "kruskal.wallis") { shellcommand = new KruskalWallisCommand(); } else if(commandName == "sra") { shellcommand = new SRACommand(); } + else if(commandName == "merge.sfffiles") { shellcommand = new MergeSfffilesCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/commandfactory.hpp b/commandfactory.hpp index eb85c68..b1db192 100644 --- a/commandfactory.hpp +++ b/commandfactory.hpp @@ -33,6 +33,7 @@ public: string getLogfileName() { return logFileName; } bool getAppend() { return append; } string getOutputDir() { return outputDir; } + string getInputDir() { return inputDir; } bool MPIEnabled(string); map getListCommands() { return commands; } diff --git a/deuniqueseqscommand.cpp b/deuniqueseqscommand.cpp index a6c151c..2ccb461 100644 --- a/deuniqueseqscommand.cpp +++ b/deuniqueseqscommand.cpp @@ -190,11 +190,9 @@ int DeUniqueSeqsCommand::execute() { ofstream out; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastaFile); } - string outFastaFile = thisOutputDir + m->getRootName(m->getSimpleName(fastaFile)); - int pos = outFastaFile.find("unique"); - if (pos != string::npos) { outFastaFile = outputDir + outFastaFile.substr(0, pos); } - else { outFastaFile = outputDir + outFastaFile; } - map variables; + string outFastaFile = thisOutputDir + m->getRootName(m->getSimpleName(fastaFile)); + + map variables; variables["[filename]"] = outFastaFile; outFastaFile = getOutputFileName("fasta", variables); m->openOutputFile(outFastaFile, out); diff --git a/getcurrentcommand.cpp b/getcurrentcommand.cpp index d74786c..b9b907d 100644 --- a/getcurrentcommand.cpp +++ b/getcurrentcommand.cpp @@ -90,7 +90,9 @@ int GetCurrentCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - + + cFactory = CommandFactory::getInstance(); + //user wants to clear a type if (types.size() != 0) { for (int i = 0; i < types.size(); i++) { @@ -158,8 +160,31 @@ int GetCurrentCommand::execute(){ m->mothurOutEndLine(); m->mothurOut("Current files saved by mothur:"); m->mothurOutEndLine(); m->printCurrentFiles(); } + + string inputDir = cFactory->getInputDir(); + if (inputDir != "") { + m->mothurOutEndLine(); m->mothurOut("Current input directory saved by mothur: " + inputDir); m->mothurOutEndLine(); + } + + string outputDir = cFactory->getOutputDir(); + if (outputDir != "") { + m->mothurOutEndLine(); m->mothurOut("Current output directory saved by mothur: " + outputDir); m->mothurOutEndLine(); + } + string defaultPath = m->getDefaultPath(); + if (defaultPath != "") { + m->mothurOutEndLine(); m->mothurOut("Current default directory saved by mothur: " + defaultPath); m->mothurOutEndLine(); + } + + + string temp = "./"; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else + temp = ".\\"; +#endif + temp = m->getFullPathName(temp); + m->mothurOutEndLine(); m->mothurOut("Current working directory: " + temp); m->mothurOutEndLine(); - return 0; + return 0; } catch(exception& e) { diff --git a/getcurrentcommand.h b/getcurrentcommand.h index 2d8dbb1..e426fe9 100644 --- a/getcurrentcommand.h +++ b/getcurrentcommand.h @@ -11,6 +11,7 @@ */ #include "command.hpp" +#include "commandfactory.hpp" class GetCurrentCommand : public Command { @@ -34,6 +35,7 @@ class GetCurrentCommand : public Command { private: + CommandFactory* cFactory; vector outputNames; bool abort; diff --git a/mergesfffilecommand.cpp b/mergesfffilecommand.cpp new file mode 100644 index 0000000..056d85c --- /dev/null +++ b/mergesfffilecommand.cpp @@ -0,0 +1,780 @@ +// +// mergesfffilecommand.cpp +// Mothur +// +// Created by Sarah Westcott on 1/31/14. +// Copyright (c) 2014 Schloss Lab. All rights reserved. +// + +#include "mergesfffilecommand.h" +#include "endiannessmacros.h" + +//********************************************************************************************************************** +vector MergeSfffilesCommand::setParameters(){ + try { + CommandParameter psff("sff", "InputTypes", "", "", "sffFile", "sffFile", "none","sff",false,false); parameters.push_back(psff); + CommandParameter pfile("file", "InputTypes", "", "", "sffFile", "sffFile", "none","sff",false,false); parameters.push_back(pfile); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string MergeSfffilesCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The merge.sfffiles command reads a sff set of files or a file containing a list of sff files and merges the individual files into a single sff file.\n"; + helpString += "The merge.sfffiles command parameters are sff and file. sff or file is required. \n"; + helpString += "The sff parameter allows you to enter the sff list of sff files separated by -'s.\n"; + helpString += "The file parameter allows you to provide a file containing a list of sff files to merge. \n"; + helpString += "Example sffinfo(sff=mySffFile.sff-mySecond.sff).\n"; + helpString += "Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "getHelpString"); + exit(1); + } +} + +//********************************************************************************************************************** +string MergeSfffilesCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "sff") { pattern = "[filename],merge.sff"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** +MergeSfffilesCommand::MergeSfffilesCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["sff"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "MergeSfffilesCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +MergeSfffilesCommand::MergeSfffilesCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + map::iterator it; + + ValidParameters validParameter; + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["sff"] = tempOutNames; + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } + else { + it = parameters.find("file"); + //user has given a template file + if(it != parameters.end()){ + string path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["file"] = inputDir + it->second; } + } + } + + sffFilename = validParameter.validFile(parameters, "sff", false); + if (sffFilename == "not found") { sffFilename = ""; } + else { + m->splitAtDash(sffFilename, filenames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < filenames.size(); i++) { + bool ignore = false; + if (filenames[i] == "current") { + filenames[i] = m->getSFFFile(); + if (filenames[i] != "") { m->mothurOut("Using " + filenames[i] + " as input file for the sff parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current sfffile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + } + } + + if (!ignore) { + if (inputDir != "") { + string path = m->hasPath(filenames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { filenames[i] = inputDir + filenames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(filenames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(filenames[i]); + m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + filenames[i] = tryPath; + } + } + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(filenames[i]); + m->mothurOut("Unable to open " + filenames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + filenames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + }else { m->setSFFFile(filenames[i]); } + } + } + } + + file = validParameter.validFile(parameters, "file", true); + if (file == "not open") { abort = true; } + else if (file == "not found") { file = ""; } + + if ((file == "") && (filenames.size() == 0)) { + m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; + } + + if ((file != "") && (filenames.size() != 0)) { //both are given + m->mothurOut("[ERROR]: cannot use file option and sff option at the same time, choose one."); m->mothurOutEndLine(); abort = true; + } + + } + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "MergeSfffilesCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int MergeSfffilesCommand::execute(){ + try { + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + map variables; + if (file != "") { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(file); } + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(file)); + readFile(); + }else { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(filenames[0]); } + variables["[filename]"] = thisOutputDir + "mergedSff"; + } + ofstream out; + outputFile = getOutputFileName("sff",variables); + m->openOutputFile(outputFile, out); + outputNames.push_back(outputFile); outputTypes["sff"].push_back(outputFile); + outputFileHeader = outputFile + ".headers"; + numTotalReads = 0; + + for (int s = 0; s < filenames.size(); s++) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + int start = time(NULL); + + filenames[s] = m->getFullPathName(filenames[s]); + m->mothurOut("\nMerging info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); + + int numReads = mergeSffInfo(filenames[s], out); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to merge " + toString(numReads) + ".\n"); + } + out.close(); + + //create new common header and add to merged file + adjustCommonHeader(); + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //set sff file as new current sff file + string current = ""; + itTypes = outputTypes.find("sff"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSFFFile(current); } + } + + //report output filenames + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int MergeSfffilesCommand::mergeSffInfo(string input, ofstream& out){ + try { + currentFileName = input; + + ifstream in; + m->openInputFileBinary(input, in); + + CommonHeader header; + readCommonHeader(in, header); + + int count = 0; + + //check magic number and version + if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; } + if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; } + + //save for adjustHeader sanity check + commonHeaders.push_back(header); + + //read through the sff file + while (!in.eof()) { + + //read data + seqRead read; Header readheader; + readSeqData(in, read, header.numFlowsPerRead, readheader, out); + + bool okay = sanityCheck(readheader, read); + if (!okay) { break; } + + count++; + + //report progress + if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } + + if (m->control_pressed) { count = 0; break; } + + if (count >= header.numReads) { break; } + } + + //report progress + if (!m->control_pressed) { if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } + + in.close(); + + return count; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "mergeSffInfo"); + exit(1); + } +} +//********************************************************************************************************************** +int MergeSfffilesCommand::readCommonHeader(ifstream& in, CommonHeader& header){ + try { + + if (!in.eof()) { + + //read magic number + char buffer[4]; + in.read(buffer, 4); + header.magicNumber = be_int4(*(unsigned int *)(&buffer)); + + //read version + char buffer9[4]; + in.read(buffer9, 4); + header.version = ""; + for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } + + //read offset + char buffer2 [8]; + in.read(buffer2, 8); + header.indexOffset = be_int8(*(unsigned long long *)(&buffer2)); + + //read index length + char buffer3 [4]; + in.read(buffer3, 4); + header.indexLength = be_int4(*(unsigned int *)(&buffer3)); + + //read num reads + char buffer4 [4]; + in.read(buffer4, 4); + header.numReads = be_int4(*(unsigned int *)(&buffer4)); + + if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); } + + //read header length + char buffer5 [2]; + in.read(buffer5, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer5)); + + //read key length + char buffer6 [2]; + in.read(buffer6, 2); + header.keyLength = be_int2(*(unsigned short *)(&buffer6)); + + //read number of flow reads + char buffer7 [2]; + in.read(buffer7, 2); + header.numFlowsPerRead = be_int2(*(unsigned short *)(&buffer7)); + + //read format code + char buffer8 [1]; + in.read(buffer8, 1); + header.flogramFormatCode = (int)(buffer8[0]); + + //read flow chars + char* tempBuffer = new char[header.numFlowsPerRead]; + in.read(&(*tempBuffer), header.numFlowsPerRead); + header.flowChars = tempBuffer; + if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead); } + delete[] tempBuffer; + + //read key + char* tempBuffer2 = new char[header.keyLength]; + in.read(&(*tempBuffer2), header.keyLength); + header.keySequence = tempBuffer2; + if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } + delete[] tempBuffer2; + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + }else{ + m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "readCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int MergeSfffilesCommand::adjustCommonHeader(){ + try { + //sanity check + bool okay = true; + if (commonHeaders.size() != 0) { + unsigned int magicN = commonHeaders[0].magicNumber; + string version = commonHeaders[0].version; + unsigned short headerLength = commonHeaders[0].headerLength; + unsigned short keyLength = commonHeaders[0].keyLength; + unsigned short numFlows = commonHeaders[0].numFlowsPerRead; + int flowCode = commonHeaders[0].flogramFormatCode; + string flowChars = commonHeaders[0].flowChars; + string keySeq = commonHeaders[0].keySequence; + + for (int i = 1; i < commonHeaders.size(); i++) { + if (commonHeaders[i].magicNumber != magicN) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Magic numbers do not match. " + filenames[0] + " magic number is " + toString(commonHeaders[0].magicNumber) + ", but " + filenames[i] + " magic number is " + toString(commonHeaders[i].magicNumber) + ".\n"); } + if (commonHeaders[i].version != version) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Versions do not match. " + filenames[0] + " version is " + commonHeaders[0].version + ", but " + filenames[i] + " version is " + commonHeaders[i].version + ".\n"); } + if (commonHeaders[i].headerLength != headerLength) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Header lengths do not match. " + filenames[0] + " header length is " + toString(commonHeaders[0].headerLength) + ", but " + filenames[i] + " header length is " + toString(commonHeaders[i].headerLength) + ".\n"); } + if (commonHeaders[i].keyLength != keyLength) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Key Lengths do not match. " + filenames[0] + " Key length is " + toString(commonHeaders[0].keyLength) + ", but " + filenames[i] + " key length is " + toString(commonHeaders[i].keyLength) + ".\n"); } + if (commonHeaders[i].numFlowsPerRead != numFlows) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Number of flows per read do not match. " + filenames[0] + " number of flows is " + toString(commonHeaders[0].numFlowsPerRead) + ", but " + filenames[i] + " number of flows is " + toString(commonHeaders[i].numFlowsPerRead) + ".\n"); } + if (commonHeaders[i].flogramFormatCode != flowCode) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Flow format codes do not match. " + filenames[0] + " Flow format code is " + toString(commonHeaders[0].flogramFormatCode) + ", but " + filenames[i] + " flow format code is " + toString(commonHeaders[i].flogramFormatCode) + ".\n"); } + if (commonHeaders[i].flowChars != flowChars) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Flow characters do not match. " + filenames[0] + " Flow characters are " + commonHeaders[0].flowChars + ", but " + filenames[i] + " flow characters are " + commonHeaders[i].flowChars + ".\n"); } + if (commonHeaders[i].keySequence != keySeq) { okay = false; m->mothurOut("[ERROR]: merge issue with common headers. Key sequences do not match. " + filenames[0] + " Key sequence is " + commonHeaders[0].keySequence + ", but " + filenames[i] + " key sequence is " + commonHeaders[i].keySequence + ".\n"); } + } + }else { m->control_pressed = true; return 0; } //should never get here + + if (!okay) { m->control_pressed = true; return 0; } + + string endian = m->findEdianness(); + char* mybuffer = new char[4]; + ifstream in; + m->openInputFileBinary(currentFileName, in); + + //magic number + in.read(mybuffer,4); + ofstream out; + m->openOutputFileBinaryAppend(outputFileHeader, out); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //version + mybuffer = new char[4]; + in.read(mybuffer,4); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //offset + mybuffer = new char[8]; + in.read(mybuffer,8); + unsigned long long offset = 0; + char* thisbuffer = new char[8]; + thisbuffer[0] = (offset >> 56) & 0xFF; + thisbuffer[1] = (offset >> 48) & 0xFF; + thisbuffer[2] = (offset >> 40) & 0xFF; + thisbuffer[3] = (offset >> 32) & 0xFF; + thisbuffer[4] = (offset >> 24) & 0xFF; + thisbuffer[5] = (offset >> 16) & 0xFF; + thisbuffer[6] = (offset >> 8) & 0xFF; + thisbuffer[7] = offset & 0xFF; + out.write(thisbuffer, 8); + delete[] thisbuffer; + delete[] mybuffer; + + //read index length + mybuffer = new char[4]; + in.read(mybuffer,4); + offset = 0; + char* thisbuffer2 = new char[4]; + thisbuffer2[0] = (offset >> 24) & 0xFF; + thisbuffer2[1] = (offset >> 16) & 0xFF; + thisbuffer2[2] = (offset >> 8) & 0xFF; + thisbuffer2[3] = offset & 0xFF; + out.write(thisbuffer2, 4); + delete[] thisbuffer2; + delete[] mybuffer; + + //change num reads + mybuffer = new char[4]; + in.read(mybuffer,4); + delete[] mybuffer; + thisbuffer2 = new char[4]; + if (endian == "BIG_ENDIAN") { + thisbuffer2[0] = (numTotalReads >> 24) & 0xFF; + thisbuffer2[1] = (numTotalReads >> 16) & 0xFF; + thisbuffer2[2] = (numTotalReads >> 8) & 0xFF; + thisbuffer2[3] = numTotalReads & 0xFF; + }else { + thisbuffer2[0] = numTotalReads & 0xFF; + thisbuffer2[1] = (numTotalReads >> 8) & 0xFF; + thisbuffer2[2] = (numTotalReads >> 16) & 0xFF; + thisbuffer2[3] = (numTotalReads >> 24) & 0xFF; + } + out.write(thisbuffer2, 4); + delete[] thisbuffer2; + + + //read header length + mybuffer = new char[2]; + in.read(mybuffer,2); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read key length + mybuffer = new char[2]; + in.read(mybuffer,2); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read number of flow reads + mybuffer = new char[2]; + in.read(mybuffer,2); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read format code + mybuffer = new char[1]; + in.read(mybuffer,1); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read flow chars + mybuffer = new char[commonHeaders[0].numFlowsPerRead]; + in.read(mybuffer,commonHeaders[0].numFlowsPerRead); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read key + mybuffer = new char[commonHeaders[0].keyLength]; + in.read(mybuffer,commonHeaders[0].keyLength); + out.write(mybuffer, in.gcount()); + delete[] mybuffer; + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + mybuffer = new char[spot-spotInFile]; + out.write(mybuffer, spot-spotInFile); + delete[] mybuffer; + in.close(); + out.close(); + + m->appendBinaryFiles(outputFile, outputFileHeader); + m->renameFile(outputFileHeader, outputFile); + m->mothurRemove(outputFileHeader); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "adjustCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +bool MergeSfffilesCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header, ofstream& out){ + try { + unsigned long long startSpotInFile = in.tellg(); + if (!in.eof()) { + + /*****************************************/ + //read header + + //read header length + char buffer [2]; + in.read(buffer, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer)); + + //read name length + char buffer2 [2]; + in.read(buffer2, 2); + header.nameLength = be_int2(*(unsigned short *)(&buffer2)); + + //read num bases + char buffer3 [4]; + in.read(buffer3, 4); + header.numBases = be_int4(*(unsigned int *)(&buffer3)); + + + //read clip qual left + char buffer4 [2]; + in.read(buffer4, 2); + header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); + header.clipQualLeft = 5; + + + //read clip qual right + char buffer5 [2]; + in.read(buffer5, 2); + header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); + + + //read clipAdapterLeft + char buffer6 [2]; + in.read(buffer6, 2); + header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); + + + //read clipAdapterRight + char buffer7 [2]; + in.read(buffer7, 2); + header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); + + + //read name + char* tempBuffer = new char[header.nameLength]; + in.read(&(*tempBuffer), header.nameLength); + header.name = tempBuffer; + if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } + + delete[] tempBuffer; + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + /*****************************************/ + //sequence read + + //read flowgram + read.flowgram.resize(numFlowReads); + for (int i = 0; i < numFlowReads; i++) { + char buffer [2]; + in.read(buffer, 2); + read.flowgram[i] = be_int2(*(unsigned short *)(&buffer)); + } + + //read flowIndex + read.flowIndex.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); + } + + //read bases + char* tempBuffer6 = new char[header.numBases]; + in.read(&(*tempBuffer6), header.numBases); + read.bases = tempBuffer6; + if (read.bases.length() > header.numBases) { read.bases = read.bases.substr(0, header.numBases); } + delete[] tempBuffer6; + + //read qual scores + read.qualScores.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); + } + + /* Pad to 8 chars */ + spotInFile = in.tellg(); + spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + char * mybuffer; + mybuffer = new char [spot-startSpotInFile]; + + ifstream in2; + m->openInputFileBinary(currentFileName, in2); + in2.seekg(startSpotInFile); + in2.read(mybuffer,spot-startSpotInFile); + + out.write(mybuffer, in2.gcount()); + numTotalReads++; + + delete[] mybuffer; + in2.close(); + + }else{ + m->mothurOut("Error reading."); m->mothurOutEndLine(); + } + + if (in.eof()) { return true; } + + return false; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "readSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +bool MergeSfffilesCommand::sanityCheck(Header& header, seqRead& read) { + try { + bool okay = true; + string message = "[WARNING]: Your sff file may be corrupted! Sequence: " + header.name + "\n"; + + if (header.clipQualLeft > read.bases.length()) { + okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; + } + if (header.clipQualRight > read.bases.length()) { + okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; + } + if (header.clipQualLeft > read.qualScores.size()) { + okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; + } + if (header.clipQualRight > read.qualScores.size()) { + okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; + } + + if (okay == false) { + m->mothurOut(message); m->mothurOutEndLine(); + } + + return okay; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "sanityCheck"); + exit(1); + } +} +//********************************************************************************************************************** +int MergeSfffilesCommand::readFile(){ + try { + + string filename; + + ifstream in; + m->openInputFile(file, in); + + while(!in.eof()) { + + if (m->control_pressed) { return 0; } + + in >> filename; m->gobble(in); + + if (m->debug) { m->mothurOut("[DEBUG]: filename = " + filename + ".\n"); } + + //check to make sure both are able to be opened + ifstream in2; + int openForward = m->openInputFile(filename, in2, "noerror"); + + //if you can't open it, try default location + if (openForward == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(filename); + m->mothurOut("Unable to open " + filename + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in3; + openForward = m->openInputFile(tryPath, in3, "noerror"); + in3.close(); + filename = tryPath; + } + } + + //if you can't open it, try output location + if (openForward == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(filename); + m->mothurOut("Unable to open " + filename + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in4; + openForward = m->openInputFile(tryPath, in4, "noerror"); + filename = tryPath; + in4.close(); + } + } + + if (openForward == 1) { //can't find it + m->mothurOut("[WARNING]: can't find " + filename + ", ignoring.\n"); + }else{ filenames.push_back(filename); } + + } + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MergeSfffilesCommand", "readFileNames"); + exit(1); + } +} +//********************************************************************************************************************** + + + + diff --git a/mergesfffilecommand.h b/mergesfffilecommand.h new file mode 100644 index 0000000..3208e61 --- /dev/null +++ b/mergesfffilecommand.h @@ -0,0 +1,57 @@ +// +// mergesfffilecommand.h +// Mothur +// +// Created by Sarah Westcott on 1/31/14. +// Copyright (c) 2014 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_mergesfffilecommand_h +#define Mothur_mergesfffilecommand_h + +#include "command.hpp" + + +/**********************************************************/ +class MergeSfffilesCommand : public Command { + +public: + MergeSfffilesCommand(string); + MergeSfffilesCommand(); + ~MergeSfffilesCommand(){} + + vector setParameters(); + string getCommandName() { return "merge.sfffiles"; } + string getCommandCategory() { return "Sequence Processing"; } + + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "http://www.mothur.org/wiki/merge.sfffiles"; } + string getDescription() { return "merge individual sfffiles into a single .sff file"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + string sffFilename, outputDir, file, currentFileName; + vector filenames, outputNames; + bool abort; + int numTotalReads, allFilesnumFlowReads, allFileskeyLength; + string outputFile, outputFileHeader; + vector commonHeaders; + + //extract sff file functions + int mergeSffInfo(string, ofstream&); + int readCommonHeader(ifstream&, CommonHeader&); + int readHeader(ifstream&, Header&); + bool readSeqData(ifstream&, seqRead&, int, Header&, ofstream&); + int decodeName(string&, string&, string&, string); + + bool sanityCheck(Header&, seqRead&); + int adjustCommonHeader(); + int readFile(); +}; + +/**********************************************************/ + +#endif diff --git a/mothur.h b/mothur.h index 32f4778..67c8ab3 100644 --- a/mothur.h +++ b/mothur.h @@ -120,6 +120,52 @@ struct diffPair { reverseProb = rp; } }; + +/**********************************************************/ +struct CommonHeader { + unsigned int magicNumber; + string version; + unsigned long long indexOffset; + unsigned int indexLength; + unsigned int numReads; + unsigned short headerLength; + unsigned short keyLength; + unsigned short numFlowsPerRead; + int flogramFormatCode; + string flowChars; //length depends on number flow reads + string keySequence; //length depends on key length + + CommonHeader(){ magicNumber=0; indexOffset=0; indexLength=0; numReads=0; headerLength=0; keyLength=0; numFlowsPerRead=0; flogramFormatCode='s'; } + ~CommonHeader() { } +}; +/**********************************************************/ +struct Header { + unsigned short headerLength; + unsigned short nameLength; + unsigned int numBases; + unsigned short clipQualLeft; + unsigned short clipQualRight; + unsigned short clipAdapterLeft; + unsigned short clipAdapterRight; + string name; //length depends on nameLength + string timestamp; + string region; + string xy; + + Header() { headerLength=0; nameLength=0; numBases=0; clipQualLeft=0; clipQualRight=0; clipAdapterLeft=0; clipAdapterRight=0; } + ~Header() { } +}; +/**********************************************************/ +struct seqRead { + vector flowgram; + vector flowIndex; + string bases; + vector qualScores; + + seqRead() { } + ~seqRead() { } +}; + /***********************************************************************/ struct PDistCell{ ull index; diff --git a/pam.cpp b/pam.cpp index d33bab8..98b2113 100644 --- a/pam.cpp +++ b/pam.cpp @@ -35,6 +35,7 @@ Pam::Pam(vector > c, vector > d, int p) : CommunityTy } } /**************************************************************************************************/ +//build and swap functions based on pam.c by maechler from R cluster package //sets Dp[0] does not set Dp[1]. chooses intial medoids. int Pam::buildPhase() { try { diff --git a/pam.h b/pam.h index ab5c68f..2d60d4e 100644 --- a/pam.h +++ b/pam.h @@ -11,6 +11,7 @@ #include "communitytype.h" +//Partitioning Around Medoids /**************************************************************************************************/ class Pam : public CommunityTypeFinder { diff --git a/sffinfocommand.h b/sffinfocommand.h index 1909e2d..12f0180 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -13,50 +13,6 @@ #include "command.hpp" #include "groupmap.h" -/**********************************************************/ -struct CommonHeader { - unsigned int magicNumber; - string version; - unsigned long long indexOffset; - unsigned int indexLength; - unsigned int numReads; - unsigned short headerLength; - unsigned short keyLength; - unsigned short numFlowsPerRead; - int flogramFormatCode; - string flowChars; //length depends on number flow reads - string keySequence; //length depends on key length - - CommonHeader(){ magicNumber=0; indexOffset=0; indexLength=0; numReads=0; headerLength=0; keyLength=0; numFlowsPerRead=0; flogramFormatCode='s'; } - ~CommonHeader() { } -}; -/**********************************************************/ -struct Header { - unsigned short headerLength; - unsigned short nameLength; - unsigned int numBases; - unsigned short clipQualLeft; - unsigned short clipQualRight; - unsigned short clipAdapterLeft; - unsigned short clipAdapterRight; - string name; //length depends on nameLength - string timestamp; - string region; - string xy; - - Header() { headerLength=0; nameLength=0; numBases=0; clipQualLeft=0; clipQualRight=0; clipAdapterLeft=0; clipAdapterRight=0; } - ~Header() { } -}; -/**********************************************************/ -struct seqRead { - vector flowgram; - vector flowIndex; - string bases; - vector qualScores; - - seqRead() { } - ~seqRead() { } -}; /**********************************************************/ class SffInfoCommand : public Command { -- 2.39.2