A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; };
A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; };
A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; };
+ A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */; };
A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; };
A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; };
A7FA2AC714A0E881007C09A6 /* bsplvb.f in Sources */ = {isa = PBXBuildFile; fileRef = A7FA2ABC14A0E881007C09A6 /* bsplvb.f */; };
A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; };
A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+ A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifytreecommand.cpp; sourceTree = "<group>"; };
+ A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifytreecommand.h; sourceTree = "<group>"; };
A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = "<group>"; };
A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = "<group>"; };
A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = "<group>"; };
A7E9B69012D37EC400DA6239 /* classifyotucommand.cpp */,
A7E9B69312D37EC400DA6239 /* classifyseqscommand.h */,
A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */,
+ A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */,
+ A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */,
A7E9B69712D37EC400DA6239 /* clearcutcommand.h */,
A7E9B69612D37EC400DA6239 /* clearcutcommand.cpp */,
A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */,
A7FA2B5B14A0F0C2007C09A6 /* intrv.f in Sources */,
A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */,
A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */,
+ A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
GCC_DYNAMIC_NO_PIC = NO;
GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
INSTALL_PATH = /usr/local/bin;
PRODUCT_NAME = Mothur;
SDKROOT = macosx10.6;
GCC_ENABLE_SSE3_EXTENSIONS = NO;
GCC_ENABLE_SSE41_EXTENSIONS = NO;
GCC_ENABLE_SSE42_EXTENSIONS = NO;
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"MOTHUR_FILES=\"\\\"../release\\\"\"",
"VERSION=\"\\\"1.23.0\\\"\"",
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
GCC_MODEL_TUNING = "";
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"VERSION=\"\\\"1.19.0\\\"\"",
"RELEASE_DATE=\"\\\"5/9/2011\\\"\"",
#include "blastdb.hpp"
#include "referencedb.h"
-/**************************************************************************************************/
-//deep copy
-AlignmentDB::AlignmentDB(const AlignmentDB& adb) : numSeqs(adb.numSeqs), longest(adb.longest), method(adb.method), emptySequence(adb.emptySequence), threadID(adb.threadID) {
- try {
-
- m = MothurOut::getInstance();
- if (adb.method == "blast") {
- search = new BlastDB(*((BlastDB*)adb.search));
- }else if(adb.method == "kmer") {
- search = new KmerDB(*((KmerDB*)adb.search));
- }else if(adb.method == "suffix") {
- search = new SuffixDB(*((SuffixDB*)adb.search));
- }else {
- m->mothurOut("[ERROR]: cannot create copy of alignment database, unrecognized method - " + adb.method); m->mothurOutEndLine();
- }
-
- for (int i = 0; i < adb.templateSequences.size(); i++) {
- Sequence temp(adb.templateSequences[i]);
- templateSequences.push_back(temp);
- }
- }
- catch(exception& e) {
- m->errorOut(e, "AlignmentDB", "AlignmentDB");
- exit(1);
- }
-
-}
/**************************************************************************************************/
AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int tid){ // This assumes that the template database is in fasta format, may
try { // need to alter this in the future?
AlignmentDB(string, string, int, float, float, float, float, int); //reads fastafile passed in and stores sequences
AlignmentDB(string);
- AlignmentDB(const AlignmentDB& adb);
~AlignmentDB();
Sequence findClosestSequence(Sequence*);
//initialze probabilities
wordGenusProb.resize(numKmers);
WordPairDiffArr.resize(numKmers);
- //cout << numKmers << '\t' << genusNodes.size() << endl;
+
for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
- //cout << numKmers << '\t' << genusNodes.size() << endl;
- ofstream out;
+ ofstream out;
ofstream out2;
#ifdef USE_MPI
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
try{
public:
BlastDB(string, float, float, float, float, string, int);
BlastDB(string, int);
- BlastDB(const BlastDB& bdb) : dbFileName(bdb.dbFileName), queryFileName(bdb.queryFileName), blastFileName(bdb.blastFileName), path(bdb.path),
- count(bdb.count), gapOpen(bdb.gapOpen), gapExtend(bdb.gapExtend), match(bdb.match), misMatch(bdb.misMatch), Database(bdb) {}
~BlastDB();
void generateDB();
vector<seqData> sequences;
bool error = false;
+ alignLength = 0;
for (int i = 0; i < thisGroupsSeqs.size(); i++) {
else {
int num = m->getNumNames(it->second);
sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
}
}
bool error = false;
ifstream in;
m->openInputFile(inputFile, in);
-
+ alignLength = 0;
+
while (!in.eof()) {
if (m->control_pressed) { in.close(); return sequences; }
if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
else {
sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
+ if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
}
}
in.close();
}
int numSeqs = sequences.size();
- int alignLength = sequences[0].sequence.size();
+ //int alignLength = sequences[0].sequence.size();
ofstream chimeraFile;
ofstream accnosFile;
for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
-
+
vector<bool> restricted = chimeras;
vector<vector<int> > leftDiffs(numSeqs);
string dummyA, dummyB;
- if(comparisons >= 2){
+ if (sequences[i].sequence.size() < 3) {
+ chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
+ }else if(comparisons >= 2){
minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
bool abort;
string fastafile, groupfile, outputDir, namefile;
- int processors;
+ int processors, alignLength;
double cutoff, alpha, beta;
vector<string> outputNames;
for (int i = 0; i < temp.length(); i++) {
//eliminate N's
- if (toupper(temp[i]) == 'N') { temp[i] == '.'; }
+ if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
numBasesCounted++;
for (int i = (temp.length()-1); i >= 0; i--) {
//eliminate N's
- if (toupper(temp[i]) == 'N') { temp[i] == '.'; }
+ if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
numBasesCounted++;
for (int i = 0; i < temp.length(); i++) {
//eliminate N's
if (toupper(temp[i]) == 'N') {
- temp[i] == '.';
+ temp[i] = '.';
tempLength--;
if (tempLength < numbases) { stopSpot = 0; break; }
}
for (int i = (temp.length()-1); i >= 0; i--) {
//eliminate N's
if (toupper(temp[i]) == 'N') {
- temp[i] == '.';
+ temp[i] = '.';
tempLength--;
if (tempLength < numbases) { stopSpot = 0; break; }
}
string extension = "";
if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
- classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flipThreshold);
+ classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip);
pDataArray.push_back(tempclass);
//MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
//make classify
Classify* myclassify;
if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); }
- else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID, pDataArray->flipThreshold); }
+ else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
else {
pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
pDataArray->m->mothurOutEndLine();
--- /dev/null
+//
+// classifytreecommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 2/20/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "classifytreecommand.h"
+#include "phylotree.h"
+
+//**********************************************************************************************************************
+//Registers the parameters this command accepts (tree and taxonomy are required;
+//name, group, cutoff, inputdir and outputdir are optional) and returns their
+//names so the option parser can validate user input against them.
+vector<string> ClassifyTreeCommand::setParameters(){
+    try {
+        CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
+        CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
+        CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname);
+        CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+        CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+
+        //collect just the parameter names for validation by the option parser
+        vector<string> myArray;
+        for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
+        return myArray;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the help text shown for "classify.tree help".
+//NOTE(review): the user-visible text spells "concensus" (sic) in several places;
+//left byte-for-byte as written since it is runtime output, not a comment.
+string ClassifyTreeCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The classify.tree command reads a tree and taxonomy file and output the concensus taxonomy for each node on the tree. \n";
+        helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
+        helpString += "The new tree contains labels at each internal node. The label is the node number so you can relate the tree to the summary file.\n";
+        helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
+        helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n";
+        helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
+        helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
+        helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//Default constructor: used by the command factory when it builds the command
+//catalog (for help/citation listings). Registers parameters and the output
+//types this command can produce, but marks the command so it never executes.
+ClassifyTreeCommand::ClassifyTreeCommand(){
+    try {
+        abort = true;
+        calledHelp = true;
+        setParameters();
+        vector<string> noOutputsYet;
+        outputTypes["tree"] = noOutputsYet;
+        outputTypes["summary"] = noOutputsYet;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Option-string constructor: parses and validates the user's parameters,
+//resolves input paths against inputdir, falls back to "current" files for
+//tree/taxonomy when not supplied, and records the output types. Sets
+//abort=true on any validation failure so execute() becomes a no-op.
+ClassifyTreeCommand::ClassifyTreeCommand(string option) {
+    try {
+        abort = false; calledHelp = false;
+
+        //allow user to run help
+        if(option == "help") { help(); abort = true; calledHelp = true; }
+        else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+        else {
+            vector<string> myArray = setParameters();
+
+            OptionParser parser(option);
+            map<string, string> parameters = parser.getParameters();
+
+            ValidParameters validParameter;
+            map<string, string>::iterator it;
+
+            //check to make sure all parameters are valid for command
+            for (it = parameters.begin(); it != parameters.end(); it++) {
+                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+            }
+
+            //reset global tree/group state before reading a new tree
+            m->runParse = true;
+            m->clearGroups();
+            m->clearAllGroups();
+            m->Treenames.clear();
+            m->names.clear();
+
+            vector<string> tempOutNames;
+            outputTypes["tree"] = tempOutNames;
+            outputTypes["summary"] = tempOutNames;
+
+            //if the user changes the input directory command factory will send this info to us in the output parameter
+            string inputDir = validParameter.validFile(parameters, "inputdir", false);
+            if (inputDir == "not found"){ inputDir = ""; }
+            else {
+                string path;
+                it = parameters.find("tree");
+                //user has given a template file
+                if(it != parameters.end()){
+                    path = m->hasPath(it->second);
+                    //if the user has not given a path then, add inputdir. else leave path alone.
+                    if (path == "") { parameters["tree"] = inputDir + it->second; }
+                }
+
+                it = parameters.find("name");
+                //user has given a template file
+                if(it != parameters.end()){
+                    path = m->hasPath(it->second);
+                    //if the user has not given a path then, add inputdir. else leave path alone.
+                    if (path == "") { parameters["name"] = inputDir + it->second; }
+                }
+
+                it = parameters.find("group");
+                //user has given a template file
+                if(it != parameters.end()){
+                    path = m->hasPath(it->second);
+                    //if the user has not given a path then, add inputdir. else leave path alone.
+                    if (path == "") { parameters["group"] = inputDir + it->second; }
+                }
+
+                it = parameters.find("taxonomy");
+                //user has given a template file
+                if(it != parameters.end()){
+                    path = m->hasPath(it->second);
+                    //if the user has not given a path then, add inputdir. else leave path alone.
+                    if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
+                }
+            }
+
+            outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+            //check for required parameters: tree falls back to the session's current tree file
+            treefile = validParameter.validFile(parameters, "tree", true);
+            if (treefile == "not open") { treefile = ""; abort = true; }
+            else if (treefile == "not found") { treefile = "";
+                treefile = m->getTreeFile();
+                if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
+                else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
+            }else { m->setTreeFile(treefile); }
+
+            //taxonomy is also required, with the same current-file fallback
+            taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
+            if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
+            else if (taxonomyfile == "not found") { taxonomyfile = "";
+                taxonomyfile = m->getTaxonomyFile();
+                if (taxonomyfile != "") { m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
+                else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
+            }else { m->setTaxonomyFile(taxonomyfile); }
+
+            //name and group are optional
+            namefile = validParameter.validFile(parameters, "name", true);
+            if (namefile == "not open") { namefile = ""; abort = true; }
+            else if (namefile == "not found") { namefile = ""; }
+            else { m->setNameFile(namefile); }
+
+            groupfile = validParameter.validFile(parameters, "group", true);
+            if (groupfile == "not open") { groupfile = ""; abort = true; }
+            else if (groupfile == "not found") { groupfile = ""; }
+            else { m->setGroupFile(groupfile); }
+
+            //consensus confidence threshold, percent; must be a strict majority
+            string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
+            m->mothurConvert(temp, cutoff);
+
+            if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
+
+            //no name file given: let the parser warn if one seems to be expected
+            if (namefile == "") {
+                vector<string> files; files.push_back(treefile);
+                parser.getNameFile(files);
+            }
+
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+//Main entry point: reads the tree (building a real or faked-out group map),
+//reads optional name file and the taxonomy file, reconciles group-file names
+//against the tree, then computes and writes the consensus taxonomy per node.
+//Returns 0 normally, 2 when the constructor flagged abort.
+int ClassifyTreeCommand::execute(){
+    try {
+
+        if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+        cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
+
+        int start = time(NULL);
+
+        /***************************************************/
+        //    reading tree info                            //
+        /***************************************************/
+        m->setTreeFile(treefile);
+        if (groupfile != "") {
+            //read in group map info.
+            tmap = new TreeMap(groupfile);
+            tmap->readMap();
+        }else{ //fake out by putting everyone in one group
+            //constructing a Tree from the file populates m->Treenames as a side effect
+            Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
+            tmap = new TreeMap();
+
+            for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
+        }
+
+        if (namefile != "") { readNamesFile(); }
+
+        read = new ReadNewickTree(treefile);
+        int readOk = read->read(tmap);
+
+        if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
+
+        read->AssembleTrees();
+        vector<Tree*> T = read->getTrees();
+        //NOTE(review): only T[0] is ever deleted below; if the file holds more
+        //than one tree the rest leak — confirm a single tree is guaranteed here.
+        Tree* outputTree = T[0];
+        delete read;
+
+        //make sure all files match
+        //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
+        int numNamesInTree;
+        if (namefile != "")  {
+            if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
+            else {   numNamesInTree = m->Treenames.size();  }
+        }else {  numNamesInTree = m->Treenames.size();  }
+
+
+        //output any names that are in group file but not in tree
+        if (numNamesInTree < tmap->getNumSeqs()) {
+            for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+                //is that name in the tree?
+                int count = 0;
+                for (int j = 0; j < m->Treenames.size(); j++) {
+                    if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
+                    count++;
+                }
+
+                if (m->control_pressed) {
+                    delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                    for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
+                    m->clearGroups();
+                    return 0;
+                }
+
+                //then you did not find it so report it
+                if (count == m->Treenames.size()) {
+                    //if it is in your namefile then don't remove
+                    map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
+
+                    if (it == nameMap.end()) {
+                        m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
+                        tmap->removeSeq(tmap->namesOfSeqs[i]);
+                        i--; //need this because removeSeq removes name from namesOfSeqs
+                    }
+                }
+            }
+        }
+
+        if (m->control_pressed) { delete outputTree; delete tmap;  return 0; }
+
+        readTaxonomyFile();
+
+
+        /***************************************************/
+        //    get concensus taxonomies                     //
+        /***************************************************/
+        getClassifications(outputTree);
+        delete outputTree; delete tmap;
+
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        //set tree file as new current treefile
+        if (treefile != "") {
+            string current = "";
+            itTypes = outputTypes.find("tree");
+            if (itTypes != outputTypes.end()) {
+                if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
+            }
+        }
+
+        m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+        m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "execute");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//traverse tree finding concensus taxonomy at each node
+//label node with a number to relate to output summary file
+//report all concensus taxonomies to file
+//Walks the tree bottom-up computing each node's descendant set, then writes
+//a summary file (node number, optional group, representative count, consensus
+//taxonomy) for every internal node, labels each internal node with its number,
+//and writes the labeled tree. Returns 0 (also on user cancel).
+int ClassifyTreeCommand::getClassifications(Tree*& T){
+    try {
+
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(treefile); }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.summary";
+        outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+
+        //print headings
+        out << "TreeNode\t";
+        if (groupfile != "") { out << "Group\t"; }
+        out << "NumRep\tTaxonomy" << endl;
+
+        string treeOutputDir = outputDir;
+        if (outputDir == "") { treeOutputDir += m->hasPath(treefile); }
+        string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.tre";
+
+        //create a map from tree node index to names of descendants, save time later
+        //(children precede parents in node order, so each node can reuse its children's entries)
+        map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
+        for (int i = 0; i < T->getNumNodes(); i++) {
+            if (m->control_pressed) { return 0; }
+
+            nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
+        }
+
+        //for each internal node (leaves occupy indices [0, numLeaves))
+        for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
+
+            if (m->control_pressed) { out.close(); return 0; }
+
+            //default is overwritten on every path below before being printed
+            string tax = "not classifed";
+            int size;
+            if (groupfile != "") {
+                //one summary line per real group; skip the synthetic AllGroups bucket
+                for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
+                    if (itGroups->first != "AllGroups") {
+                        tax = getTaxonomy(itGroups->second, size);
+                        out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
+                    }
+                }
+            }else {
+                string group = "AllGroups";
+                tax = getTaxonomy(nodeToDescendants[i][group], size);
+                out << (i+1) << '\t' << size << '\t' << tax << endl;
+            }
+
+            //label node with its 1-based number so the tree relates to the summary file
+            T->tree[i].setLabel((i+1));
+        }
+        out.close();
+
+        ofstream outTree;
+        m->openOutputFile(outputTreeFileName, outTree);
+        outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
+        T->print(outTree, "both");
+        outTree.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        //fixed: previously reported "GetConcensusTaxonomies", a function that
+        //does not exist, making crash reports point at the wrong place
+        m->errorOut(e, "ClassifyTreeCommand", "getClassifications");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Computes the consensus taxonomy for a set of sequence names by inserting each
+//name's taxonomy into a PhyloTree and descending from the root, at each level
+//keeping the child holding the most sequences while its share of `size` stays
+//at or above `cutoff` percent. Pads with "unclassified;" to the tree's max
+//level. `size` is set to the number of sequences actually counted.
+//NOTE(review): `names` is passed by value (full copy of the set) — presumably
+//fine at these sizes, but confirm before using on very large trees.
+string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
+    try{
+        string conTax = "";
+        size = 0;
+
+        //create a tree containing sequences from this bin
+        PhyloTree* phylo = new PhyloTree();
+
+        for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
+
+
+            //if namesfile include the names
+            if (namefile != "") {
+
+                //is this sequence in the name file - namemap maps seqName -> repSeqName
+                map<string, string>::iterator it2 = nameMap.find(*it);
+
+                if (it2 == nameMap.end()) { //this name is not in name file, skip it
+                    m->mothurOut((*it) + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
+                }else{
+
+                    //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
+                    map<string, string>::iterator itTax = taxMap.find((it2->second));
+
+                    if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
+
+                        if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
+                        else {  m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
+                    }else{
+                        //add seq to tree - once per duplicate so abundance weights the consensus
+                        int num = nameCount[(*it)]; // we know its there since we found it in nameMap
+                        for (int i = 0; i < num; i++) {  phylo->addSeqToTree((*it)+toString(i), it2->second);  }
+                        size += num;
+                    }
+                }
+
+            }else{
+                //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
+                map<string, string>::iterator itTax = taxMap.find((*it));
+
+                if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
+                    m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
+                }else{
+                    //add seq to tree
+                    phylo->addSeqToTree((*it), itTax->second);
+                    size++;
+                }
+            }
+
+            if (m->control_pressed) { delete phylo; return conTax; }
+
+        }
+
+        //build tree
+        phylo->assignHeirarchyIDs(0);
+
+        TaxNode currentNode = phylo->get(0);
+        int myLevel = 0;
+        //at each level pick the most populous child; stop when it drops below cutoff
+        while (currentNode.children.size() != 0) { //you still have more to explore
+
+            //NOTE(review): if every child had zero accessions, bestChild would stay
+            //default-constructed; assumes at least one child holds sequences — confirm
+            TaxNode bestChild;
+            int bestChildSize = 0;
+
+            //go through children
+            for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
+
+                TaxNode temp = phylo->get(itChild->second);
+
+                //select child with largest accesions - most seqs assigned to it
+                if (temp.accessions.size() > bestChildSize) {
+                    bestChild = phylo->get(itChild->second);
+                    bestChildSize = temp.accessions.size();
+                }
+
+            }
+
+            //is this taxonomy above cutoff
+            int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
+
+            if (consensusConfidence >= cutoff) { //if yes, add it
+                conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
+                myLevel++;
+            }else{ //if no, quit
+                break;
+            }
+
+            //move down a level
+            currentNode = bestChild;
+        }
+
+        //pad shallow consensus out to the full depth of the taxonomy
+        if (myLevel != phylo->getMaxLevel()) {
+            while (myLevel != phylo->getMaxLevel()) {
+                conTax += "unclassified;";
+                myLevel++;
+            }
+        }
+        if (conTax == "") {  conTax = "no_consensus;";  }
+
+        delete phylo;
+
+        return conTax;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//Returns the leaf names beneath one tree node, bucketed by group name plus a
+//synthetic "AllGroups" bucket containing everything. A leaf maps to itself;
+//an internal node is the union of its two children's buckets, which the caller
+//has already computed bottom-up and passes in via `descendants`.
+map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int nodeIndex, map<int, map<string, set<string> > > descendants){
+    try {
+        map<string, set<string> > groupedNames;
+
+        int leftChild = T->tree[nodeIndex].getLChild();
+        int rightChild = T->tree[nodeIndex].getRChild();
+
+        if (leftChild == -1) {
+            //leaf: its only descendant is itself, filed under its own group and AllGroups
+            string myGroup = tmap->getGroup(T->tree[nodeIndex].getName());
+            set<string> self;
+            self.insert(T->tree[nodeIndex].getName());
+            groupedNames[myGroup] = self;
+            groupedNames["AllGroups"] = self;
+        }else{
+            //internal node: start from the left child's buckets, then fold in the
+            //right child's; operator[] creates any bucket not yet present, and the
+            //range insert unions the member sets
+            groupedNames = descendants[leftChild];
+            map<string, set<string> >::iterator rightIt;
+            for (rightIt = descendants[rightChild].begin(); rightIt != descendants[rightChild].end(); rightIt++) {
+                groupedNames[rightIt->first].insert((rightIt->second).begin(), (rightIt->second).end());
+            }
+        }
+
+        return groupedNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Loads the taxonomy file (one "name<TAB>taxonomy" record per line) into
+//taxMap, stripping per-level confidence scores such as "Bacteria(100);"
+//down to "Bacteria;". Clears taxMap and returns early on user cancel.
+int ClassifyTreeCommand::readTaxonomyFile() {
+    try {
+
+        ifstream in;
+        m->openInputFile(taxonomyfile, in);
+
+        string name, tax;
+
+        while(!in.eof()){
+            in >> name >> tax;
+            m->gobble(in);
+
+            //are there confidence scores, if so remove them
+            //fixed: find_first_of returns an unsigned string::size_type, so compare
+            //against string::npos rather than the literal -1
+            if (tax.find_first_of('(') != string::npos) {  m->removeConfidences(tax);	}
+
+            taxMap[name] = tax;
+
+            if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
+        }
+        in.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile");
+        exit(1);
+    }
+}
+
+/*****************************************************************/
+//Reads the name file (unique<TAB>comma-separated-duplicates per line), filling
+//nameMap (each sequence -> its representative) and nameCount (representative ->
+//number of sequences it stands for), and counting the unique names read.
+int ClassifyTreeCommand::readNamesFile() {
+    try {
+        ifstream inNames;
+        m->openInputFile(namefile, inNames);
+
+        //fixed: numUniquesInName was never initialized or updated anywhere,
+        //yet execute() compares it against m->Treenames.size(); count one per
+        //name-file record so that comparison is meaningful
+        numUniquesInName = 0;
+
+        string name, names;
+
+        while(!inNames.eof()){
+            inNames >> name;			//read from first column  A
+            inNames >> names;		//read from second column  A,B,C,D
+            m->gobble(inNames);
+
+            numUniquesInName++;
+
+            //parse names into vector
+            vector<string> theseNames;
+            m->splitAtComma(names, theseNames);
+
+            for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
+            nameCount[name] = theseNames.size();
+
+            if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
+        }
+        inNames.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "readNamesFile");
+        exit(1);
+    }
+}
+
+/*****************************************************************/
+
+
--- /dev/null
+#ifndef Mothur_classifytreecommand_h
+#define Mothur_classifytreecommand_h
+
+//
+//  classifytreecommand.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 2/20/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "command.hpp"
+#include "readtree.h"
+#include "treemap.h"
+
+//classify.tree: reads a tree and taxonomy file and reports the consensus
+//taxonomy for the descendants of each internal tree node (per group if a
+//group file is supplied); outputs a labeled tree and a summary file.
+class ClassifyTreeCommand : public Command {
+public:
+	ClassifyTreeCommand(string);
+	ClassifyTreeCommand();
+	~ClassifyTreeCommand(){}
+
+	vector<string> setParameters();
+	string getCommandName()			{ return "classify.tree";		}
+	string getCommandCategory()		{ return "Phylotype Analysis";	}
+	string getHelpString();
+	string getCitation() { return "http://www.mothur.org/wiki/Classify.tree"; }
+	string getDescription()		{ return "Find the concensus taxonomy for the descendant of each tree node"; }
+
+	int execute();
+	void help() { m->mothurOut(getHelpString()); }
+
+private:
+	ReadTree* read;                     //newick tree reader, owned during execute()
+	TreeMap* tmap;                      //sequence -> group map (real or faked single-group)
+	string treefile, taxonomyfile, groupfile, namefile, outputDir;
+	bool abort;                         //true when validation failed or help/citation was requested
+	vector<string> outputNames;         //files produced, reported and cleaned up on cancel
+	int numUniquesInName, cutoff;       //unique names in name file; consensus confidence threshold (51-100)
+	map<string, string> nameMap;        //sequence name -> representative name (from name file)
+	map<string, int> nameCount;         //representative name -> number of sequences it represents
+	map<string, string> taxMap;         //sequence name -> taxonomy string (confidences stripped)
+
+	int getClassifications(Tree*&);
+	map<string, set<string> > getDescendantList(Tree*&, int, map<int, map<string, set<string> > >);
+	string getTaxonomy(set<string>, int&);
+	int readNamesFile();
+	int readTaxonomyFile();
+
+};
+
+
+
+#endif
try {
//update location of seqs in smallRow since they move to smallCol now
for (int i = 0; i < dMatrix.size(); i++) {
- cout << "row = " << i << '\t';
+ m->mothurOut("row = " + toString(i) + "\t");
for (int j = 0; j < dMatrix[i].size(); j++) {
- cout << dMatrix[i][j] << '\t';
+ m->mothurOut(toString(dMatrix[i][j]) + "\t");
}
- cout << endl;
+ m->mothurOutEndLine();
}
}
catch(exception& e) {
*/
#include "clustersplitcommand.h"
-#include "readcluster.h"
-#include "splitmatrix.h"
-#include "readphylip.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "inputdata.h"
+
//**********************************************************************************************************************
MPI_Barrier(MPI_COMM_WORLD);
#else
-
+		///////////////////// WINDOWS CAN ONLY USE 1 PROCESSOR -- ACCESS VIOLATION UNRESOLVED ///////////////////////
//sanity check
if (processors > distName.size()) { processors = distName.size(); }
if(processors == 1){
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
}else{
-
- //cout << processors << '\t' << distName.size() << endl;
- vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
- dividedNames.resize(processors);
-
- //for each file group figure out which process will complete it
- //want to divide the load intelligently so the big files are spread between processes
- for (int i = 0; i < distName.size(); i++) {
- //cout << i << endl;
- int processToAssign = (i+1) % processors;
- if (processToAssign == 0) { processToAssign = processors; }
-
- dividedNames[(processToAssign-1)].push_back(distName[i]);
- }
-
- //not lets reverse the order of ever other process, so we balance big files running with little ones
- for (int i = 0; i < processors; i++) {
- //cout << i << endl;
- int remainder = ((i+1) % processors);
- if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
- }
-
- createProcesses(dividedNames);
-
- if (m->control_pressed) { return 0; }
-
- //get list of list file names from each process
- for(int i=0;i<processors;i++){
- string filename = toString(processIDS[i]) + ".temp";
- ifstream in;
- m->openInputFile(filename, in);
-
- in >> tag; m->gobble(in);
-
- while(!in.eof()) {
- string tempName;
- in >> tempName; m->gobble(in);
- listFileNames.push_back(tempName);
- }
- in.close();
- m->mothurRemove((toString(processIDS[i]) + ".temp"));
-
- //get labels
- filename = toString(processIDS[i]) + ".temp.labels";
- ifstream in2;
- m->openInputFile(filename, in2);
-
- float tempCutoff;
- in2 >> tempCutoff; m->gobble(in2);
- if (tempCutoff < cutoff) { cutoff = tempCutoff; }
-
- while(!in2.eof()) {
- string tempName;
- in2 >> tempName; m->gobble(in2);
- if (labels.count(tempName) == 0) { labels.insert(tempName); }
- }
- in2.close();
- m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
- }
- }
+ listFileNames = createProcesses(distName, labels);
+ }
#else
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
#endif
}
}
//**********************************************************************************************************************
-int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
+vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
try {
+
+ vector<string> listFiles;
+ vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
+ dividedNames.resize(processors);
+
+ //for each file group figure out which process will complete it
+ //want to divide the load intelligently so the big files are spread between processes
+ for (int i = 0; i < distName.size(); i++) {
+ //cout << i << endl;
+ int processToAssign = (i+1) % processors;
+ if (processToAssign == 0) { processToAssign = processors; }
+
+ dividedNames[(processToAssign-1)].push_back(distName[i]);
+ if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
+ }
+
+        //now let's reverse the order of every other process, so we balance big files running with little ones
+ for (int i = 0; i < processors; i++) {
+ //cout << i << endl;
+ int remainder = ((i+1) % processors);
+ if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
+ }
+
+ if (m->control_pressed) { return listFiles; }
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- int exitCommand = 1;
+ int process = 1;
processIDS.clear();
//loop through and create all the processes you want
}
}
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
//force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
+ for (int i=0;i< processIDS.size();i++) {
int temp = processIDS[i];
wait(&temp);
}
+
+ //get list of list file names from each process
+ for(int i=0;i<processIDS.size();i++){
+ string filename = toString(processIDS[i]) + ".temp";
+ ifstream in;
+ m->openInputFile(filename, in);
+
+ in >> tag; m->gobble(in);
+
+ while(!in.eof()) {
+ string tempName;
+ in >> tempName; m->gobble(in);
+ listFiles.push_back(tempName);
+ }
+ in.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp"));
+
+ //get labels
+ filename = toString(processIDS[i]) + ".temp.labels";
+ ifstream in2;
+ m->openInputFile(filename, in2);
+
+ float tempCutoff;
+ in2 >> tempCutoff; m->gobble(in2);
+ if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+
+ while(!in2.eof()) {
+ string tempName;
+ in2 >> tempName; m->gobble(in2);
+ if (labels.count(tempName) == 0) { labels.insert(tempName); }
+ }
+ in2.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
+ }
+
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the clusterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add labels.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
- return exitCommand;
+ vector<clusterData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
+ pDataArray.push_back(tempCluster);
+ processIDS.push_back(i);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+
+ }
+
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ //get tag
+ tag = pDataArray[i]->tag;
+ //get listfiles created
+ for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
+ //get labels
+ set<string>::iterator it;
+ for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
+ //check cutoff
+ if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
#endif
+
+ return listFiles;
}
catch(exception& e) {
vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
try {
- Cluster* cluster;
- SparseMatrix* matrix;
- ListVector* list;
- ListVector oldList;
- RAbundVector* rabund;
vector<string> listFileNames;
-
double smallestCutoff = cutoff;
//cluster each distance file
for (int i = 0; i < distNames.size(); i++) {
+
+ Cluster* cluster = NULL;
+ SparseMatrix* matrix = NULL;
+ ListVector* list = NULL;
+ ListVector oldList;
+ RAbundVector* rabund = NULL;
+
if (m->control_pressed) { return listFileNames; }
string thisNamefile = distNames[i].begin()->second;
oldList = *list;
matrix = read->getMatrix();
- delete read;
- delete nameMap;
+ delete read; read = NULL;
+ delete nameMap; nameMap = NULL;
#ifdef USE_MPI
}
delete matrix; delete list; delete cluster; delete rabund;
+ matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
listFile.close();
if (m->control_pressed) { //clean up
#include "listvector.hpp"
#include "cluster.hpp"
#include "sparsematrix.hpp"
-
+#include "readcluster.h"
+#include "splitmatrix.h"
+#include "readphylip.h"
+#include "readcolumn.h"
+#include "readmatrix.hpp"
+#include "inputdata.h"
+#include "clustercommand.h"
class ClusterSplitCommand : public Command {
ofstream outList, outRabund, outSabund;
void printData(ListVector*);
- int createProcesses(vector < vector < map<string, string> > >);
+ vector<string> createProcesses(vector< map<string, string> >, set<string>&);
vector<string> cluster(vector< map<string, string> >, set<string>&);
int mergeLists(vector<string>, map<float, int>, ListVector*);
map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
int createMergedDistanceFile(vector< map<string, string> >);
};
+/////////////////not working for Windows////////////////////////////////////////////////////////////
+// getting an access violation error.  This is most likely caused by the
+// threads stepping on each other's structures, as I can run the thread function and the cluster function
+// separately without errors occurring.  I suspect it may be in the use of the
+// static class mothurOut, but I can't pinpoint the problem.  All other objects are made new
+// within the thread.  MothurOut is used by almost all the classes in mothur, so if this were
+// really the cause I would expect all the windows threaded commands to have issues, but not
+// all do.  So far, shhh.flows and trim.flows have similar problems.  Other thoughts: could it have
+// anything to do with mothur's use of copy constructors in many of our data structures? ie. listvector
+// is copied by nameassignment and passed to read which passes to the thread? -westcott 2-8-12
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct clusterData {
+	set<string> labels;                         //labels found while clustering; shared memory lets worker threads add to it (see note above)
+	vector < map<string, string> > distNames;   //distance-file -> name-file pairs assigned to this thread
+	string method;                              //clustering method: "furthest", "nearest" or "average"
+	MothurOut* m;                               //shared output/logging object (suspected in the Windows issue above)
+	double cutoff, precision;                   //cutoff may be lowered by the thread; parent reads it back
+	string tag, outputDir;                      //tag is set by the thread from the Cluster object
+	vector<string> listFiles;                   //list files this thread created; parent collects them
+	bool hard;                                  //hard clustering: ceil distances instead of rounding
+	int length, threadID;                       //length: digits used when formatting distance labels
+	
+	
+	clusterData(){}
+	clusterData(vector < map<string, string> > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th) {
+		distNames = dv;
+		m = mout;
+		cutoff = cu;
+		method = me;
+		outputDir = ou;
+		hard = hd;
+		precision = pre;
+		length = len;
+		threadID = th;
+	}
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ 
+	//thread entry point: clusters each distance file assigned to this thread,
+	//writing one .list file per distance file and recording tag/labels/cutoff
+	//back into the shared clusterData struct (see shared-memory note above).
+	//Removed the leftover cout debug lines; unsynchronized cout from worker
+	//threads interleaves output and bypasses mothur's logging.
+	clusterData* pDataArray;
+	pDataArray = (clusterData*)lpParam;
+	
+	try {
+		double smallestCutoff = pDataArray->cutoff;
+		
+		//cluster each distance file
+		for (int i = 0; i < pDataArray->distNames.size(); i++) {
+			
+			Cluster* mycluster = NULL;
+			SparseMatrix* mymatrix = NULL;
+			ListVector* mylist = NULL;
+			ListVector myoldList;
+			RAbundVector* myrabund = NULL;
+			
+			if (pDataArray->m->control_pressed) { break; }
+			
+			string thisNamefile = pDataArray->distNames[i].begin()->second;
+			string thisDistFile = pDataArray->distNames[i].begin()->first;
+			
+			pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine();
+			
+			ReadMatrix* myread = new ReadColumnMatrix(thisDistFile); 	
+			myread->setCutoff(pDataArray->cutoff);
+			NameAssignment* mynameMap = new NameAssignment(thisNamefile);
+			mynameMap->readMap();
+			myread->read(mynameMap);
+			
+			if (pDataArray->m->control_pressed) { delete myread; delete mynameMap; break; }
+			
+			mylist = myread->getListVector();
+			myoldList = *mylist;
+			mymatrix = myread->getMatrix();
+			delete myread; myread = NULL;
+			delete mynameMap; mynameMap = NULL;
+			
+			pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine();
+			
+			myrabund = new RAbundVector(mylist->getRAbundVector());
+			
+			//create cluster; assumes method was validated upstream to be one of
+			//furthest/nearest/average -- TODO confirm, otherwise mycluster stays NULL
+			if (pDataArray->method == "furthest")	{	mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			else if(pDataArray->method == "nearest"){	mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			else if(pDataArray->method == "average"){	mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			pDataArray->tag = mycluster->getTag();
+			
+			if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); }
+			string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile));
+			
+			ofstream listFile;
+			pDataArray->m->openOutputFile(fileroot+ pDataArray->tag + ".list", listFile);
+			
+			pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list");
+			
+			float previousDist = 0.00000;
+			float rndPreviousDist = 0.00000;
+			
+			myoldList = *mylist;
+			
+			double saveCutoff = pDataArray->cutoff;
+			
+			while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){
+				
+				//on cancel just leave the loop; cleanup happens exactly once below.
+				//(the original deleted matrix/list/cluster/rabund here AND again
+				//after the loop -- a double delete)
+				if (pDataArray->m->control_pressed) { break; }
+				
+				mycluster->update(saveCutoff);
+				
+				float dist = mymatrix->getSmallDist();
+				float rndDist;
+				if (pDataArray->hard) {
+					rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision); 
+				}else{
+					rndDist = pDataArray->m->roundDist(dist, pDataArray->precision); 
+				}
+				
+				//print the list each time the (rounded) distance level changes
+				if(previousDist <= 0.0000 && dist != previousDist){
+					myoldList.setLabel("unique");
+					myoldList.print(listFile);
+					if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
+				}
+				else if(rndDist != rndPreviousDist){
+					myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
+					myoldList.print(listFile);
+					if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
+				}
+				
+				previousDist = dist;
+				rndPreviousDist = rndDist;
+				myoldList = *mylist;
+			}
+			
+			if (pDataArray->m->control_pressed) { //clean up (exactly once) and quit
+				delete mymatrix; delete mylist; delete mycluster; delete myrabund;
+				listFile.close();
+				for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); }
+				pDataArray->listFiles.clear(); break;
+			}
+			
+			//flush the final distance level
+			if(previousDist <= 0.0000){
+				myoldList.setLabel("unique");
+				myoldList.print(listFile);
+				if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
+			}
+			else if(rndPreviousDist<pDataArray->cutoff){
+				myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
+				myoldList.print(listFile);
+				if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
+			}
+			
+			delete mymatrix; delete mylist; delete mycluster; delete myrabund;
+			mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL;
+			listFile.close();
+			
+			//the split distance/name files are intermediates -- remove them
+			pDataArray->m->mothurRemove(thisDistFile);
+			pDataArray->m->mothurRemove(thisNamefile);
+			
+			if (saveCutoff != pDataArray->cutoff) { 
+				if (pDataArray->hard)	{  saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision);	}
+				else		{	saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision);  }
+				
+				pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine();  
+			}
+			
+			if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
+		}
+		
+		//report the smallest cutoff actually used back to the parent
+		pDataArray->cutoff = smallestCutoff;
+		
+		return 0;
+		
+	}
+	catch(exception& e) {
+		pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction");
+		exit(1);
+	}
+}
+#endif
+
+
+
+
#endif
#include "summaryqualcommand.h"
#include "otuassociationcommand.h"
#include "sortseqscommand.h"
+#include "classifytreecommand.h"
/*******************************************************/
commands["shhh.seqs"] = "shhh.seqs";
commands["otu.association"] = "otu.association";
commands["sort.seqs"] = "sort.seqs";
+ commands["classify.tree"] = "classify.tree";
commands["quit"] = "MPIEnabled";
}
else if(commandName == "shhh.seqs") { command = new ShhhSeqsCommand(optionString); }
else if(commandName == "otu.association") { command = new OTUAssociationCommand(optionString); }
else if(commandName == "sort.seqs") { command = new SortSeqsCommand(optionString); }
+ else if(commandName == "classify.tree") { command = new ClassifyTreeCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
else if(commandName == "shhh.seqs") { pipecommand = new ShhhSeqsCommand(optionString); }
else if(commandName == "otu.association") { pipecommand = new OTUAssociationCommand(optionString); }
else if(commandName == "sort.seqs") { pipecommand = new SortSeqsCommand(optionString); }
+ else if(commandName == "classify.tree") { pipecommand = new ClassifyTreeCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "shhh.seqs") { shellcommand = new ShhhSeqsCommand(); }
else if(commandName == "otu.association") { shellcommand = new OTUAssociationCommand(); }
else if(commandName == "sort.seqs") { shellcommand = new SortSeqsCommand(); }
+ else if(commandName == "classify.tree") { shellcommand = new ClassifyTreeCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
bool CommandFactory::isValidCommand(string command) {
try {
//open input file
ifstream in;
m->openInputFile(namefile, in);
-
+
+ int total = 0;
while (!in.eof()) {
if (m->control_pressed) { break; }
out << firstCol << '\t' << names.size() << endl;
}
-
+ total += names.size();
}
in.close();
if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
+ m->mothurOutEndLine();
+ m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
m->mothurOutEndLine();
m->mothurOut("Output File Name: "); m->mothurOutEndLine();
m->mothurOut(outputFileName); m->mothurOutEndLine();
public:
Database();
- Database(const Database& db) : numSeqs(db.numSeqs), longest(db.longest), searchScore(db.searchScore), results(db.results), Scores(db.Scores) { m = MothurOut::getInstance(); }
virtual ~Database();
virtual void generateDB() = 0;
virtual void addSequence(Sequence) = 0; //add sequence to search engine
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
//used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
int DeCalculator::findLargestContrib(vector<int> seen) {
try{
exit(1);
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
try{
exit(1);
}
}
-/**************************************************************************************************
+**************************************************************************************************
int DistanceCommand::convertToLowerTriangle(string outputFile) {
try{
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
//its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
//but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
//also check to make sure the 2 files have the same alignment length.
#include "onegapignore.h"
-/**************************************************************************************************/
-DistanceDB::DistanceDB(const DistanceDB& ddb) : data(ddb.data), templateSeqsLength(ddb.templateSeqsLength), templateAligned(ddb.templateAligned), Database(ddb) {
- distCalculator = new oneGapIgnoreTermGapDist();
-}
/**************************************************************************************************/
DistanceDB::DistanceDB() : Database() {
try {
public:
DistanceDB();
- DistanceDB(const DistanceDB& ddb);
~DistanceDB() { delete distCalculator; }
void generateDB() {} //doesn't generate a search db
public:
eachGapDist() {}
- eachGapDist(const eachGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
int diff = 0;
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
int HCluster::processFile() {
try {
string firstName, secondName;
public:
ignoreGaps() {}
- ignoreGaps(const ignoreGaps& ddb) {}
void calcDist(Sequence A, Sequence B){
int diff = 0;
public:
KmerDB(string, int);
- KmerDB(const KmerDB& kdb) : kmerSize(kdb.kmerSize), maxKmer(kdb.maxKmer), count(kdb.count), kmerDBName(kdb.kmerDBName), kmerLocations(kdb.kmerLocations), Database(kdb) {}
KmerDB();
~KmerDB();
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
try {
if(qvalues[i] < threshold){
m->mothurOut("Feature " + toString((i+1)) + " is significant, q = ");
cout << qvalues[i];
- m->mothurOutJustToLog(toString(pvalues[i])); m->mothurOutEndLine();
+ m->mothurOutJustToLog(toString(qvalues[i])); m->mothurOutEndLine();
}
}
vector<double> MothurMetastats::calc_qvalues(vector<double>& pValues) {
try {
+ /* cout << "x <- c(" << pValues[0];
+ for (int l = 1; l < pValues.size(); l++){
+ cout << ", " << pValues[l];
+ }
+ cout << ")\n";*/
+
int numRows = pValues.size();
vector<double> qvalues(numRows, 0.0);
if (pid == 0) { //only one process should output to screen
#endif
- cout << output;
out << output;
+ logger() << output;
#ifdef USE_MPI
}
if (pid == 0) { //only one process should output to screen
#endif
- cout << endl;
out << endl;
+ logger() << endl;
#ifdef USE_MPI
}
if (pid == 0) { //only one process should output to screen
#endif
- cout << output;
+
out << output;
outputFile << output;
+ logger() << output;
#ifdef USE_MPI
}
#endif
+
}
catch(exception& e) {
errorOut(e, "MothurOut", "MothurOut");
if (pid == 0) { //only one process should output to screen
#endif
- cout << endl;
out << endl;
outputFile << endl;
+ logger() << endl;
#ifdef USE_MPI
}
}else if (path[(pos-1)] == '/') { //you want the current working dir ./
path = path.substr(0, pos);
}else if (pos == 1) { break; //you are at the end
- }else { cout << "cannot resolve path for " << fileName << endl; return fileName; }
+ }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; }
}
for (int i = index; i >= 0; i--) {
}else if (path[(pos-1)] == '\\') { //you want the current working dir ./
path = path.substr(0, pos);
}else if (pos == 1) { break; //you are at the end
- }else { cout << "cannot resolve path for " << fileName << endl; return fileName; }
+ }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; }
}
for (int i = index; i >= 0; i--) {
while(isspace(d) && (d != in.eof())) { d=in.get(); count++;}
}
positions.push_back(count-1);
- cout << count-1 << endl;
+ //cout << count-1 << endl;
}
in.close();
#include "mothur.h"
+/***********************************************/
+//minimal wrapper around cout so screen output can be expressed as a stream
+//object (used by MothurOut alongside its file streams).
+struct logger {
+    
+    logger() {}
+    ~logger() {}
+    
+    //forward any streamable value to stdout; returns *this so calls chain
+    template< class T >
+    logger& operator <<( const T& o ) {
+        cout << o; return *this;
+    }
+    
+    //overload for stream manipulators (e.g. endl), which are function
+    //pointers and would not bind to the template overload's const T&
+    logger& operator<<(ostream& (*m)(ostream&) ) {
+        cout << m; return *this;
+    }
+    
+};
/***********************************************/
class MothurOut {
for(int i=0;i<numRefSeqs;i++){
if(restricted[i] == 0){
- if(leftDiffs[i][l] < singleLeft[l] && sequences[i].frequency || (leftDiffs[i][l] == singleLeft[l] && sequences[i].frequency > sequences[bestLeft[l]].frequency)){
+ if(((leftDiffs[i][l] < singleLeft[l]) && sequences[i].frequency) || ((leftDiffs[i][l] == singleLeft[l]) && (sequences[i].frequency > sequences[bestLeft[l]].frequency))){
singleLeft[l] = leftDiffs[i][l];
bestLeft[l] = i;
}
for(int i=0;i<numRefSeqs;i++){
if(restricted[i] == 0){
- if(rightDiffs[i][l] < singleRight[l] && sequences[i].frequency || (rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency)){
+ if((rightDiffs[i][l] < singleRight[l] && sequences[i].frequency) || ((rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency))){
singleRight[l] = rightDiffs[i][l];
bestRight[l] = i;
}
if(restricted[i] == 0){
int delta = leftDiffs[i][y] - leftDiffs[i][x];
- if(delta < minDelta[x][y] || delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency){
+ if(delta < minDelta[x][y] || (delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency)){
minDelta[x][y] = delta;
minDeltaSeq[x][y] = i;
}
public:
oneGapDist() {}
- oneGapDist(const oneGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
public:
oneGapIgnoreTermGapDist() {}
- oneGapIgnoreTermGapDist(const oneGapIgnoreTermGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
//initialize Dscore
for (int i=0; i<globaldata->Groups.size(); i++) { DScore[globaldata->Groups[i]] = 0.0; }
- /********************************************************
+ ********************************************************
//calculate a D value for each group
for(int v=0;v<treeNodes.size();v++){
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
m->openInputFile(fastafile, inFasta);
//string firstCol, secondCol, nameString;
- length = 0;
+ set<int> lengths;
while (!inFasta.eof()) {
else{
seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
+ lengths.insert(seq.getAligned().length());
}
}else { //no names file, you are identical to yourself
seqPNode tempNode(1, seq, seq.getName());
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
+ lengths.insert(seq.getAligned().length());
}
}
}
inFasta.close();
//inNames.close();
+
+ if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
+
return alignSeqs.size();
}
/**************************************************************************************************/
int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
try {
- length = 0;
+ set<int> lengths;
alignSeqs.clear();
map<string, string>::iterator it;
bool error = false;
seqPNode tempNode(numReps, thisSeqs[i], it->second);
alignSeqs.push_back(tempNode);
- if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ lengths.insert(thisSeqs[i].getAligned().length());
}
}else { //no names file, you are identical to yourself
seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
alignSeqs.push_back(tempNode);
- if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ lengths.insert(thisSeqs[i].getAligned().length());
}
}
+ if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
+
//sanity check
if (error) { m->control_pressed = true; }
}
/*******************************************************
ReferenceDB::~ReferenceDB() { myInstance = NULL; }
-/*******************************************************/
+*******************************************************/
return 0;
}
-/**************************************************************************************************/
+**************************************************************************************************/
Sequence(string, string);
Sequence(ifstream&);
Sequence(istringstream&);
- Sequence(const Sequence& se) : name(se.name), unaligned(se.unaligned), aligned(se.aligned), pairwise(se.pairwise), numBases(se.numBases), startPos(se.startPos), endPos(se.endPos),
- alignmentLength(se.alignmentLength), isAligned(se.isAligned), longHomoPolymer(se.longHomoPolymer), ambigBases(se.ambigBases) { m = MothurOut::getInstance(); }
-
//these constructors just set the unaligned string to save space
Sequence(string, string, string);
Sequence(ifstream&, string);
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
vector<SharedRAbundFloatVector*> SharedRAbundFloatVector::getSharedRAbundFloatVectors(){
try {
SharedUtil* util;
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
SAbundVector SharedRAbundFloatVector::getSAbundVector() {
try {
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
//this is not functional, not sure how to handle it yet, but I need the stub because it is a pure function
OrderVector SharedRAbundFloatVector::getOrderVector(map<string,int>* nameMap = NULL) {
try {
}
-/***********************************************************************/
+***********************************************************************/
//reads a shared file
SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), numBins(0), numSeqs(0) {
try {
#include "shhhercommand.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "rabundvector.hpp"
-#include "sabundvector.hpp"
-#include "listvector.hpp"
-#include "cluster.hpp"
-#include "sparsematrix.hpp"
-#include <cfloat>
-
//**********************************************************************************************************************
vector<string> ShhherCommand::setParameters(){
try {
ShhherCommand::ShhherCommand(string option) {
try {
-
-#ifdef USE_MPI
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
- MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
-
- if(pid == 0){
-#endif
-
-
abort = false; calledHelp = false;
-
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
else if(option == "citation") { citation(); abort = true; calledHelp = true;}
m->openOutputFile(compositeNamesFileName, temp);
temp.close();
}
+
+ if(flowFilesFileName != "not found"){
+ string fName;
+
+ ifstream flowFilesFile;
+ m->openInputFile(flowFilesFileName, flowFilesFile);
+ while(flowFilesFile){
+ fName = m->getline(flowFilesFile);
+
+ //test if file is valid
+ ifstream in;
+ int ableToOpen = m->openInputFile(fName, in, "noerror");
+ in.close();
+ if (ableToOpen == 1) {
+ if (inputDir != "") { //default path is set
+ string tryPath = inputDir + fName;
+ m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ //if you can't open it its not in current working directory or inputDir, try mothur excutable location
+ if (ableToOpen == 1) {
+ string exepath = m->argv;
+ string tempPath = exepath;
+ for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+ exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+
+ string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+
+ if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
+ else { flowFileVector.push_back(fName); }
+ m->gobble(flowFilesFile);
+ }
+ flowFilesFile.close();
+ if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+ }
+ else{
+ flowFileVector.push_back(flowFileName);
+ }
+
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
}
}
-
-#ifdef USE_MPI
- }
-#endif
-
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "ShhherCommand");
int tag = 1976;
MPI_Status status;
-
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
+
if(pid == 0){
for(int i=1;i<ncpus;i++){
getSingleLookUp(); if (m->control_pressed) { return 0; }
getJointLookUp(); if (m->control_pressed) { return 0; }
- vector<string> flowFileVector;
- if(flowFilesFileName != "not found"){
- string fName;
-
- ifstream flowFilesFile;
- m->openInputFile(flowFilesFileName, flowFilesFile);
- while(flowFilesFile){
- fName = m->getline(flowFilesFile);
- flowFileVector.push_back(fName);
- m->gobble(flowFilesFile);
- }
- }
- else{
- flowFileVector.push_back(flowFileName);
- }
int numFiles = flowFileVector.size();
for(int i=1;i<ncpus;i++){
exit(1);
}
}
-
/**************************************************************************************************/
string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
return fDistFileName;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "flowDistParentFork");
+ m->errorOut(e, "ShhherCommand", "flowDistMPI");
exit(1);
}
}
+/**************************************************************************************************/
+
+//Parses a mothur list file (label, OTU count, then one comma-separated string of
+//sequence names per OTU) and fills the OTU bookkeeping state: otuData maps each
+//sequence index to its OTU, nSeqsPerOTU/cumNumSeqs give per-OTU counts and running
+//offsets, and aaP/aaI hold per-OTU sequence indices. Each parsed name is looked up
+//in nameMap and then erased, so nameMap is consumed by this routine.
+void ShhherCommand::getOTUData(string listFileName){
+    try {
+
+        ifstream listFile;
+        m->openInputFile(listFileName, listFile);
+        string label;
+
+        listFile >> label >> numOTUs;
+
+        otuData.assign(numSeqs, 0);
+        cumNumSeqs.assign(numOTUs, 0);
+        nSeqsPerOTU.assign(numOTUs, 0);
+        aaP.clear();aaP.resize(numOTUs);
+
+        seqNumber.clear();
+        aaI.clear();
+        seqIndex.clear();
+
+        string singleOTU = "";
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            listFile >> singleOTU;
+
+            istringstream otuString(singleOTU);
+
+            while(otuString){
+
+                string seqName = "";
+
+                //walk the comma-separated token character by character
+                for(int j=0;j<singleOTU.length();j++){
+                    char letter = otuString.get();
+
+                    if(letter != ','){
+                        seqName += letter;
+                    }
+                    else{
+                        //comma ends a name: record it under OTU i and drop it from nameMap
+                        //NOTE(review): assumes every name is present in nameMap — nmIt is
+                        //dereferenced without an end() check; confirm list/flow files agree
+                        map<string,int>::iterator nmIt = nameMap.find(seqName);
+                        int index = nmIt->second;
+
+                        nameMap.erase(nmIt);
+
+                        otuData[index] = i;
+                        nSeqsPerOTU[i]++;
+                        aaP[i].push_back(index);
+                        seqName = "";
+                    }
+                }
+
+                //last name in the token has no trailing comma; record it the same way
+                map<string,int>::iterator nmIt = nameMap.find(seqName);
+
+                int index = nmIt->second;
+                nameMap.erase(nmIt);
+
+                otuData[index] = i;
+                nSeqsPerOTU[i]++;
+                aaP[i].push_back(index);
+
+                otuString.get();    //advance past end so the while(otuString) test fails
+            }
+
+            sort(aaP[i].begin(), aaP[i].end());
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                seqNumber.push_back(aaP[i][j]);
+            }
+            //pad aaP[i] out to numSeqs entries so later code can index it densely
+            for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
+                aaP[i].push_back(0);
+            }
+
+
+        }
+
+        //prefix sums: cumNumSeqs[i] is the flat offset of OTU i's first sequence
+        for(int i=1;i<numOTUs;i++){
+            cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
+        }
+        aaI = aaP;
+        seqIndex = seqNumber;
+
+        listFile.close();
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getOTUData");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Allocates the per-iteration clustering state (distances, change flags, centroids,
+//weights, taus) and splits the sequence and OTU index ranges into near-equal chunks,
+//one per processor, recorded in nSeqsBreaks/nOTUsBreaks.
+void ShhherCommand::initPyroCluster(){
+    try{
+        //more workers than OTUs would leave some idle; fall back to serial
+        if (numOTUs < processors) { processors = 1; }
+
+        dist.assign(numSeqs * numOTUs, 0);
+        change.assign(numOTUs, 1);
+        centroids.assign(numOTUs, -1);
+        weight.assign(numOTUs, 0);
+        singleTau.assign(numSeqs, 1.0);
+
+        nSeqsBreaks.assign(processors+1, 0);
+        nOTUsBreaks.assign(processors+1, 0);
+
+        nSeqsBreaks[0] = 0;
+        for(int i=0;i<processors;i++){
+            nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
+            nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
+        }
+        //last chunk absorbs any rounding remainder so the full range is covered
+        nSeqsBreaks[processors] = numSeqs;
+        nOTUsBreaks[processors] = numOTUs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "initPyroCluster");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Flattens the per-OTU aaP/aaI tables into the parallel arrays seqNumber/seqIndex
+//and records each OTU's starting offset in cumNumSeqs, so tau/sequence data can be
+//addressed as one contiguous run per OTU.
+void ShhherCommand::fill(){
+    try {
+        int index = 0;
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            cumNumSeqs[i] = index;
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                seqNumber[index] = aaP[i][j];
+                seqIndex[index] = aaI[i][j];
+
+                index++;
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "fill");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Reads the flowgram file: first token is the number of flow cells per read, then one
+//line per read (name, usable length, numFlowCells intensities). Intensities are capped
+//at 9.99 and stored as integers scaled by 100 in flowDataIntI; nameMap maps each read
+//name to its index. Flow values past a read's usable length are zeroed.
+void ShhherCommand::getFlowData(){
+    try{
+        ifstream flowFile;
+        m->openInputFile(flowFileName, flowFile);
+
+        string seqName;
+        seqNameVector.clear();
+        lengths.clear();
+        flowDataIntI.clear();
+        nameMap.clear();
+
+
+        int currentNumFlowCells;
+
+        float intensity;
+
+        flowFile >> numFlowCells;
+        int index = 0;//pcluster
+        while(!flowFile.eof()){
+
+            if (m->control_pressed) { break; }
+
+            flowFile >> seqName >> currentNumFlowCells;
+            lengths.push_back(currentNumFlowCells);
+
+            seqNameVector.push_back(seqName);
+            nameMap[seqName] = index++;//pcluster
+
+            //every read carries numFlowCells values even if fewer are usable
+            for(int i=0;i<numFlowCells;i++){
+                flowFile >> intensity;
+                if(intensity > 9.99)	{	intensity = 9.99;	}
+                //scale to an integer bin; +0.0001 guards against float truncation
+                int intI = int(100 * intensity + 0.0001);
+                flowDataIntI.push_back(intI);
+            }
+            m->gobble(flowFile);
+        }
+        flowFile.close();
+
+        numSeqs = seqNameVector.size();
+
+        //zero out the flow values beyond each read's usable length
+        for(int i=0;i<numSeqs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int iNumFlowCells = i * numFlowCells;
+            for(int j=lengths[i];j<numFlowCells;j++){
+                flowDataIntI[iNumFlowCells + j] = 0;
+            }
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getFlowData");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Expectation step for sequences [startSeq, stopSeq) on an MPI child: refreshes the
+//sequence-to-centroid distances for changed OTUs, converts them to responsibilities
+//(tau) normalized per sequence, and emits sparse (otuIndex, seqIndex, singleTau)
+//triples for every tau above MIN_TAU. The offset trick subtracts the minimum distance
+//before exponentiating to avoid underflow.
+void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
+
+    try{
+        vector<double> newTau(numOTUs,0);
+        vector<double> norms(numSeqs, 0);
+        otuIndex.clear();
+        seqIndex.clear();
+        singleTau.clear();
+
+        for(int i=startSeq;i<stopSeq;i++){
+
+            if (m->control_pressed) { break; }
+
+            double offset = 1e8;
+            int indexOffset = i * numOTUs;
+
+            for(int j=0;j<numOTUs;j++){
+
+                //only recompute distances for OTUs whose centroid moved
+                if(weight[j] > MIN_WEIGHT && change[j] == 1){
+                    dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+                }
+                if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+                    offset = dist[indexOffset + j];
+                }
+            }
+
+            for(int j=0;j<numOTUs;j++){
+                if(weight[j] > MIN_WEIGHT){
+                    newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+                    norms[i] += newTau[j];
+                }
+                else{
+                    newTau[j] = 0.0;
+                }
+            }
+
+            //keep only significant responsibilities in sparse form
+            for(int j=0;j<numOTUs;j++){
+
+                newTau[j] /= norms[i];
+
+                if(newTau[j] > MIN_TAU){
+                    otuIndex.push_back(j);
+                    seqIndex.push_back(i);
+                    singleTau.push_back(newTau[j]);
+                }
+            }
+
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Expectation step run on the parent process for sequences [startSeq, stopSeq):
+//same tau computation as calcNewDistancesChildMPI, but stores the surviving
+//responsibilities directly into the OTU tables (singleTau/seqNumber/seqIndex grow
+//as triples are found; aaP/aaI/nSeqsPerOTU are rebuilt in place).
+void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+
+    try{
+
+        int total = 0;
+        vector<double> newTau(numOTUs,0);
+        vector<double> norms(numSeqs, 0);
+        nSeqsPerOTU.assign(numOTUs, 0);
+
+        for(int i=startSeq;i<stopSeq;i++){
+
+            if (m->control_pressed) { break; }
+
+            int indexOffset = i * numOTUs;
+
+            double offset = 1e8;
+
+            for(int j=0;j<numOTUs;j++){
+
+                //only recompute distances for OTUs whose centroid moved
+                if(weight[j] > MIN_WEIGHT && change[j] == 1){
+                    dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+                }
+
+                if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+                    offset = dist[indexOffset + j];
+                }
+            }
+
+            for(int j=0;j<numOTUs;j++){
+                if(weight[j] > MIN_WEIGHT){
+                    newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+                    norms[i] += newTau[j];
+                }
+                else{
+                    newTau[j] = 0.0;
+                }
+            }
+
+            for(int j=0;j<numOTUs;j++){
+                newTau[j] /= norms[i];
+            }
+
+            for(int j=0;j<numOTUs;j++){
+                if(newTau[j] > MIN_TAU){
+
+                    int oldTotal = total;
+
+                    total++;
+
+                    //grow the flat tau arrays one triple at a time
+                    singleTau.resize(total, 0);
+                    seqNumber.resize(total, 0);
+                    seqIndex.resize(total, 0);
+
+                    singleTau[oldTotal] = newTau[j];
+
+                    aaP[j][nSeqsPerOTU[j]] = oldTotal;
+                    aaI[j][nSeqsPerOTU[j]] = i;
+                    nSeqsPerOTU[j]++;
+                }
+            }
+
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Converts the soft (fractional tau) assignments into hard assignments: each
+//sequence is given wholly (tau = 1) to the OTU with its maximum responsibility,
+//and the per-OTU tables are rebuilt and re-flattened via fill().
+void ShhherCommand::setOTUs(){
+
+    try {
+        //densify the sparse taus into a numSeqs x numOTUs matrix
+        vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                int index = cumNumSeqs[i] + j;
+                double tauValue = singleTau[seqNumber[index]];
+                int sIndex = seqIndex[index];
+                bigTauMatrix[sIndex * numOTUs + i] = tauValue;
+            }
+        }
+
+        //argmax over OTUs for each sequence
+        for(int i=0;i<numSeqs;i++){
+            double maxTau = -1.0000;
+            int maxOTU = -1;
+            for(int j=0;j<numOTUs;j++){
+                if(bigTauMatrix[i * numOTUs + j] > maxTau){
+                    maxTau = bigTauMatrix[i * numOTUs + j];
+                    maxOTU = j;
+                }
+            }
+
+            otuData[i] = maxOTU;
+        }
+
+        nSeqsPerOTU.assign(numOTUs, 0);
+
+        for(int i=0;i<numSeqs;i++){
+            int index = otuData[i];
+
+            singleTau[i] = 1.0000;
+            dist[i] = 0.0000;
+
+            aaP[index][nSeqsPerOTU[index]] = i;
+            aaI[index][nSeqsPerOTU[index]] = i;
+
+            nSeqsPerOTU[index]++;
+        }
+        fill();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "setOTUs");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Dereplicates the flowgrams: rounds each sequence's scaled intensities to integer
+//homopolymer calls and compares against the uniques found so far (over the shorter
+//of the two lengths). Builds mapSeqToUnique/mapUniqueToSeq, per-unique counts and
+//max lengths, and finally precomputes flowDataPrI (per-intensity probabilities).
+void ShhherCommand::getUniques(){
+    try{
+
+
+        numUniques = 0;
+        uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+        uniqueCount.assign(numSeqs, 0);							//	anWeights
+        uniqueLengths.assign(numSeqs, 0);
+        mapSeqToUnique.assign(numSeqs, -1);
+        mapUniqueToSeq.assign(numSeqs, -1);
+
+        vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+
+        for(int i=0;i<numSeqs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int index = 0;
+
+            //round the 0-999 scaled intensities to whole homopolymer calls
+            vector<short> current(numFlowCells);
+            for(int j=0;j<numFlowCells;j++){
+                current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
+            }
+
+            for(int j=0;j<numUniques;j++){
+                int offset = j * numFlowCells;
+                bool toEnd = 1;
+
+                //compare only over the shorter usable length of the pair
+                int shorterLength;
+                if(lengths[i] < uniqueLengths[j])	{	shorterLength = lengths[i];			}
+                else								{	shorterLength = uniqueLengths[j];	}
+
+                for(int k=0;k<shorterLength;k++){
+                    if(current[k] != uniqueFlowgrams[offset + k]){
+                        toEnd = 0;
+                        break;
+                    }
+                }
+
+                if(toEnd){
+                    //match: fold sequence i into existing unique j
+                    mapSeqToUnique[i] = j;
+                    uniqueCount[j]++;
+                    index = j;
+                    if(lengths[i] > uniqueLengths[j])	{	uniqueLengths[j] = lengths[i];	}
+                    break;
+                }
+                index++;
+            }
+
+            if(index == numUniques){
+                //no match: sequence i starts a new unique flowgram
+                uniqueLengths[numUniques] = lengths[i];
+                uniqueCount[numUniques] = 1;
+                mapSeqToUnique[i] = numUniques;//anMap
+                mapUniqueToSeq[numUniques] = i;//anF
+
+                for(int k=0;k<numFlowCells;k++){
+                    uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+                    uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+                }
+
+                numUniques++;
+            }
+        }
+        uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+        uniqueLengths.resize(numUniques);
+
+        flowDataPrI.resize(numSeqs * numFlowCells, 0);
+        for(int i=0;i<flowDataPrI.size();i++)	{	if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);		}
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getUniques");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Average per-flow distance between two sequences' flowgrams, computed over the
+//shorter of the two usable lengths. Uses the precomputed joint/intensity lookup
+//tables: dist_i = P(intA,intB) - P(intA) - P(intB), summed and divided by length.
+float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
+    try{
+        //BUGFIX: the comparison previously tested lengths[seqB] while the assignment
+        //used lengths[mapSeqToUnique[seqB]]; both must index through mapSeqToUnique
+        //(as the seqA initializer does) or minLength can be taken from the wrong read.
+        int minLength = lengths[mapSeqToUnique[seqA]];
+        if(lengths[mapSeqToUnique[seqB]] < minLength){	minLength = lengths[mapSeqToUnique[seqB]];	}
+
+        int ANumFlowCells = seqA * numFlowCells;
+        int BNumFlowCells = seqB * numFlowCells;
+
+        float dist = 0;
+
+        for(int i=0;i<minLength;i++){
+
+            if (m->control_pressed) { break; }
+
+            int flowAIntI = flowDataIntI[ANumFlowCells + i];
+            float flowAPrI = flowDataPrI[ANumFlowCells + i];
+
+            int flowBIntI = flowDataIntI[BNumFlowCells + i];
+            float flowBPrI = flowDataPrI[BNumFlowCells + i];
+            dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+        }
+
+        dist /= (float) minLength;
+        return dist;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************/
+
+//Runs furthest-neighbor (complete linkage) clustering on the column-format distance
+//file up to the configured cutoff, writes the resulting OTU list to a .shhh.list
+//file next to the flow file, and returns that file name.
+string ShhherCommand::cluster(string distFileName, string namesFileName){
+    try {
+
+        ReadMatrix* read = new ReadColumnMatrix(distFileName);
+        read->setCutoff(cutoff);
+
+        NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+        clusterNameMap->readMap();
+        read->read(clusterNameMap);
+
+        ListVector* list = read->getListVector();
+        SparseMatrix* matrix = read->getMatrix();
+
+        delete read;
+        delete clusterNameMap;
+
+        RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+        Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+        string tag = cluster->getTag();
+
+        //merge until the smallest remaining distance exceeds the cutoff
+        double clusterCutoff = cutoff;
+        while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+            if (m->control_pressed) { break; }
+
+            cluster->update(clusterCutoff);
+        }
+
+        list->setLabel(toString(cutoff));
+
+        string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+        ofstream listFile;
+        m->openOutputFile(listFileName, listFile);
+        list->print(listFile);
+        listFile.close();
+
+        delete matrix;	delete cluster;	delete rabund; delete list;
+
+        return listFileName;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "cluster");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//For each OTU in [start, finish), picks as centroid the member unique flowgram that
+//minimizes the tau-weighted distance to all members; sets change[i] when the
+//centroid moved so the E-step only recomputes affected distances.
+void ShhherCommand::calcCentroidsDriver(int start, int finish){
+
+    //this function gets the most likely homopolymer length at a flow position for a group of sequences
+    //within an otu
+
+    try{
+
+        for(int i=start;i<finish;i++){
+
+            if (m->control_pressed) { break; }
+
+            double count = 0;
+            int position = 0;
+            int minFlowGram = 100000000;
+            double minFlowValue = 1e8;
+            change[i] = 0; //FALSE
+
+            //total responsibility mass assigned to this OTU
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+            }
+
+            if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+                vector<double> adF(nSeqsPerOTU[i]);
+                vector<int> anL(nSeqsPerOTU[i]);
+
+                //collect the distinct unique-flowgram candidates among members
+                for(int j=0;j<nSeqsPerOTU[i];j++){
+                    int index = cumNumSeqs[i] + j;
+                    int nI = seqIndex[index];
+                    int nIU = mapSeqToUnique[nI];
+
+                    int k;
+                    for(k=0;k<position;k++){
+                        if(nIU == anL[k]){
+                            break;
+                        }
+                    }
+                    if(k == position){
+                        anL[position] = nIU;
+                        adF[position] = 0.0000;
+                        position++;
+                    }
+                }
+
+                //accumulate tau-weighted distance of every member to each candidate
+                for(int j=0;j<nSeqsPerOTU[i];j++){
+                    int index = cumNumSeqs[i] + j;
+                    int nI = seqIndex[index];
+
+                    double tauValue = singleTau[seqNumber[index]];
+
+                    for(int k=0;k<position;k++){
+                        double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+                        adF[k] += dist * tauValue;
+                    }
+                }
+
+                for(int j=0;j<position;j++){
+                    if(adF[j] < minFlowValue){
+                        minFlowGram = j;
+                        minFlowValue = adF[j];
+                    }
+                }
+
+                if(centroids[i] != anL[minFlowGram]){
+                    change[i] = 1;
+                    centroids[i] = anL[minFlowGram];
+                }
+            }
+            else if(centroids[i] != -1){
+                //OTU fell below the minimum mass: retire its centroid
+                change[i] = 1;
+                centroids[i] = -1;
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Average per-flow distance of sequence `flow` to the unique flowgram `cent` over the
+//first `length` flow cells, looked up from the precomputed singleLookUp table
+//(indexed by homopolymer call x intensity bin).
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+    try{
+
+        int flowAValue = cent * numFlowCells;
+        int flowBValue = flow * numFlowCells;
+
+        double dist = 0;
+
+        for(int i=0;i<length;i++){
+            dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+            flowAValue++;
+            flowBValue++;
+        }
+
+        return dist / (double)length;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getDistToCentroid");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Maximization step for OTU weights: each weight becomes the sum of the taus of its
+//member sequences. Returns the largest absolute weight change, used as the EM
+//convergence criterion.
+double ShhherCommand::getNewWeights(){
+    try{
+
+        double maxChange = 0;
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            double difference = weight[i];
+            weight[i] = 0;
+
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                int index = cumNumSeqs[i] + j;
+                double tauValue = singleTau[seqNumber[index]];
+                weight[i] += tauValue;
+            }
+
+            difference = fabs(weight[i] - difference);
+            if(difference > maxChange){	maxChange = difference;	}
+        }
+        return maxChange;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getNewWeights");
+        exit(1);
+    }
+}
+
+ /**************************************************************************************************/
+
+//Negative log-likelihood of the current mixture: P[seq] = sum over OTUs of
+//weight * exp(-sigma * dist), floored at DBL_EPSILON to keep log() finite.
+//(effNumOTUs is computed but currently unused beyond this scope.)
+double ShhherCommand::getLikelihood(){
+
+    try{
+
+        vector<long double> P(numSeqs, 0);
+        int effNumOTUs = 0;
+
+        for(int i=0;i<numOTUs;i++){
+            if(weight[i] > MIN_WEIGHT){
+                effNumOTUs++;
+            }
+        }
+
+        string hold;
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                int index = cumNumSeqs[i] + j;
+                int nI = seqIndex[index];
+                double singleDist = dist[seqNumber[index]];
+
+                P[nI] += weight[i] * exp(-singleDist * sigma);
+            }
+        }
+        double nLL = 0.00;
+        for(int i=0;i<numSeqs;i++){
+            if(P[i] == 0){	P[i] = DBL_EPSILON;	}
+
+            nLL += -log(P[i]);
+        }
+
+        nLL = nLL -(double)numSeqs * log(sigma);
+
+        return nLL;
+    }
+    catch(exception& e) {
+        //BUGFIX: error handler previously reported "getNewWeights"; report the
+        //actual function so error messages point at the right place.
+        m->errorOut(e, "ShhherCommand", "getLikelihood");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Merges OTUs that converged to the same centroid: the later duplicate's weight is
+//folded into the first and its centroid cleared. OTUs with no centroid or
+//negligible weight are marked ineligible (-1) and skipped.
+void ShhherCommand::checkCentroids(){
+    try{
+        vector<int> unique(numOTUs, 1);
+
+        for(int i=0;i<numOTUs;i++){
+            if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+                unique[i] = -1;
+            }
+        }
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            if(unique[i] == 1){
+                for(int j=i+1;j<numOTUs;j++){
+                    if(unique[j] == 1){
+
+                        if(centroids[j] == centroids[i]){
+                            //duplicate centroid: absorb OTU j into OTU i
+                            unique[j] = 0;
+                            centroids[j] = -1;
+
+                            weight[i] += weight[j];
+                            weight[j] = 0.0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "checkCentroids");
+        exit(1);
+    }
+}
+ /**************************************************************************************************/
+
+
+
+//Writes a .shhh.qual file with one Phred-style quality record per non-empty OTU,
+//derived from the tau-weighted homopolymer probabilities at each flow position of
+//the OTU's centroid. The first four flows (the key) are skipped on output.
+void ShhherCommand::writeQualities(vector<int> otuCounts){
+
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
+        string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
+
+        ofstream qualityFile;
+        m->openOutputFile(qualityFileName, qualityFile);
+
+        qualityFile.setf(ios::fixed, ios::floatfield);
+        qualityFile.setf(ios::showpoint);
+        qualityFile << setprecision(6);
+
+        vector<vector<int> > qualities(numOTUs);
+        vector<double> pr(HOMOPS, 0);
+
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int index = 0;
+            int base = 0;
+
+            if(nSeqsPerOTU[i] > 0){
+                qualities[i].assign(1024, -1);
+
+                while(index < numFlowCells){
+                    double maxPrValue = 1e8;
+                    short maxPrIndex = -1;
+                    double count = 0.0000;
+
+                    pr.assign(HOMOPS, 0);
+
+                    //accumulate, over OTU members, the tau-weighted log-likelihood of
+                    //each candidate homopolymer length s at this flow position
+                    for(int j=0;j<nSeqsPerOTU[i];j++){
+                        int lIndex = cumNumSeqs[i] + j;
+                        double tauValue = singleTau[seqNumber[lIndex]];
+                        int sequenceIndex = aaI[i][j];
+                        short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+                        count += tauValue;
+
+                        for(int s=0;s<HOMOPS;s++){
+                            pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
+                        }
+                    }
+
+                    //the centroid's call at this flow is taken as the reference length
+                    maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+                    maxPrValue = pr[maxPrIndex];
+
+                    if(count > MIN_COUNT){
+                        double U = 0.0000;
+                        double norm = 0.0000;
+
+                        for(int s=0;s<HOMOPS;s++){
+                            norm += exp(-(pr[s] - maxPrValue));
+                        }
+
+                        //one quality value per called base of the homopolymer run
+                        for(int s=1;s<=maxPrIndex;s++){
+                            int value = 0;
+                            double temp = 0.0000;
+
+                            U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+                            if(U>0.00){
+                                temp = log10(U);
+                            }
+                            else{
+                                temp = -10.1;
+                            }
+                            temp = floor(-10 * temp);
+                            value = (int)floor(temp);
+                            if(value > 100){	value = 100;	}
+
+                            qualities[i][base] = (int)value;
+                            base++;
+                        }
+                    }
+
+                    index++;
+                }
+            }
+
+
+            if(otuCounts[i] > 0){
+                //NOTE(review): the header name comes from mapUniqueToSeq[i] (indexed by
+                //OTU) while writeSequences uses aaI[i][0]; confirm these name the same read
+                qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+                int j=4;	//need to get past the first four bases
+                while(qualities[i][j] != -1){
+                    qualityFile << qualities[i][j] << ' ';
+                    j++;
+                }
+                qualityFile << endl;
+            }
+        }
+        qualityFile.close();
+        outputNames.push_back(qualityFileName);
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeQualities");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Writes a .shhh.fasta file with one denoised sequence per non-empty OTU, translated
+//from the OTU centroid's flowgram (flow order repeated every 4 cells); the first
+//four bases (the key) are trimmed. Appends to the composite fasta when one is set.
+void ShhherCommand::writeSequences(vector<int> otuCounts){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
+        string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
+        ofstream fastaFile;
+        m->openOutputFile(fastaFileName, fastaFile);
+
+        vector<string> names(numOTUs, "");
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int index = centroids[i];
+
+            if(otuCounts[i] > 0){
+                fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+                string newSeq = "";
+
+                for(int j=0;j<numFlowCells;j++){
+
+                    //emit one base per unit of homopolymer length at this flow
+                    char base = flowOrder[j % 4];
+                    for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+                        newSeq += base;
+                    }
+                }
+
+                fastaFile << newSeq.substr(4) << endl;
+            }
+        }
+        fastaFile.close();
+
+        outputNames.push_back(fastaFileName);
+
+        if(compositeFASTAFileName != ""){
+            m->appendFiles(fastaFileName, compositeFASTAFileName);
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeSequences");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Writes a .shhh.names file mapping each non-empty OTU's representative (its first
+//member) to the comma-separated list of all member sequence names. Appends to the
+//composite names file when one is set.
+void ShhherCommand::writeNames(vector<int> otuCounts){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
+        string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
+        ofstream nameFile;
+        m->openOutputFile(nameFileName, nameFile);
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            if(otuCounts[i] > 0){
+                //representative name appears twice: once as key, once as first member
+                nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+                for(int j=1;j<nSeqsPerOTU[i];j++){
+                    nameFile << ',' << seqNameVector[aaI[i][j]];
+                }
+
+                nameFile << endl;
+            }
+        }
+        nameFile.close();
+        outputNames.push_back(nameFileName);
+
+
+        if(compositeNamesFileName != ""){
+            m->appendFiles(nameFileName, compositeNamesFileName);
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeNames");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Writes a .shhh.groups file assigning every sequence to a single group named after
+//the flow file's root name (one file == one group).
+void ShhherCommand::writeGroups(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
+        string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+        string groupFileName = fileRoot + "shhh.groups";
+        ofstream groupFile;
+        m->openOutputFile(groupFileName, groupFile);
+
+        for(int i=0;i<numSeqs;i++){
+            if (m->control_pressed) { break; }
+            groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
+        }
+        groupFile.close();
+        outputNames.push_back(groupFileName);
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeGroups");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+//Writes a .shhh.counts diagnostic file: for each non-empty OTU, the translated
+//centroid ("ideal") sequence followed by each member's raw flowgram translated back
+//to bases (rounded from the scaled intensities), key-trimmed.
+void ShhherCommand::writeClusters(vector<int> otuCounts){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
+        string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
+        ofstream otuCountsFile;
+        m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+        string bases = flowOrder;
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) {
+                break;
+            }
+            //output the translated version of the centroid sequence for the otu
+            if(otuCounts[i] > 0){
+                int index = centroids[i];
+
+                otuCountsFile << "ideal\t";
+                //start at flow 8: skips the key flows for this report
+                for(int j=8;j<numFlowCells;j++){
+                    char base = bases[j % 4];
+                    for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+                        otuCountsFile << base;
+                    }
+                }
+                otuCountsFile << endl;
+
+                for(int j=0;j<nSeqsPerOTU[i];j++){
+                    int sequence = aaI[i][j];
+                    otuCountsFile << seqNameVector[sequence] << '\t';
+
+                    string newSeq = "";
+
+                    for(int k=0;k<lengths[sequence];k++){
+                        char base = bases[k % 4];
+                        //round the 0.01-scaled intensity to a homopolymer length
+                        int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+                        for(int s=0;s<freq;s++){
+                            newSeq += base;
+                            //otuCountsFile << base;
+                        }
+                    }
+                    otuCountsFile << newSeq.substr(4) << endl;
+                }
+                otuCountsFile << endl;
+            }
+        }
+        otuCountsFile.close();
+        outputNames.push_back(otuCountsFileName);
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeClusters");
+        exit(1);
+    }
+}
#else
//**********************************************************************************************************************
getSingleLookUp(); if (m->control_pressed) { return 0; }
getJointLookUp(); if (m->control_pressed) { return 0; }
-
- vector<string> flowFileVector;
- if(flowFilesFileName != "not found"){
- string fName;
+ int numFiles = flowFileVector.size();
+
+ if (numFiles < processors) { processors = numFiles; }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
+ else { createProcesses(flowFileVector); } //each processor processes one file
+#else
+ driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
+#endif
+
+ if(compositeFASTAFileName != ""){
+ outputNames.push_back(compositeFASTAFileName);
+ outputNames.push_back(compositeNamesFileName);
+ }
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "execute");
+ exit(1);
+ }
+}
+#endif
+/**************************************************************************************************/
+
+int ShhherCommand::createProcesses(vector<string> filenames){
+ try {
+ vector<int> processIDS;
+ int process = 1;
+ int num = 0;
+
+ //sanity check
+ if (filenames.size() < processors) { processors = filenames.size(); }
+
+ //divide the groups between the processors
+ vector<linePair> lines;
+ int numFilesPerProcessor = filenames.size() / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numFilesPerProcessor;
+ int endIndex = (i+1) * numFilesPerProcessor;
+ if(i == (processors - 1)){ endIndex = filenames.size(); }
+ lines.push_back(linePair(startIndex, endIndex));
+ }
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
- ifstream flowFilesFile;
- m->openInputFile(flowFilesFileName, flowFilesFile);
- while(flowFilesFile){
- fName = m->getline(flowFilesFile);
- flowFileVector.push_back(fName);
- m->gobble(flowFilesFile);
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
}
}
- else{
- flowFileVector.push_back(flowFileName);
+
+ //do my part
+ driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //The Windows version uses shared memory, so be careful when passing variables through the shhhFlowsData struct.
+ //The fork() above clones the process, so memory is separate there, but that is not the case on Windows.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<shhhFlowsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ // Allocate memory for thread data.
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; }
+
+ shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
+ pDataArray.push_back(tempFlow);
+ processIDS.push_back(i);
+
+ hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
}
- int numFiles = flowFileVector.size();
+ //using the main process as a worker saves time and memory
+ //do my part
+ driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
- for(int i=0;i<numFiles;i++){
+ #endif
+
+ for (int i=0;i<processIDS.size();i++) {
+ if (compositeFASTAFileName != "") {
+ m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
+ m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
+ m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
+ m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "createProcesses");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
+ try {
+
+ for(int i=start;i<end;i++){
if (m->control_pressed) { break; }
- flowFileName = flowFileVector[i];
-
- m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
+ string flowFileName = filenames[i];
+
+ m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
m->mothurOut("Reading flowgrams...\n");
- getFlowData();
+
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
if (m->control_pressed) { break; }
m->mothurOut("Identifying unique flowgrams...\n");
- getUniques();
+ int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
if (m->control_pressed) { break; }
m->mothurOut("Calculating distances between flowgrams...\n");
- string distFileName = createDistFile(processors);
- string namesFileName = createNamesFile();
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ unsigned long long begTime = time(NULL);
+ double begClock = clock();
+
+ flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+
+ m->mothurOutEndLine();
+ m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
if (m->control_pressed) { break; }
m->mothurOut("\nClustering flowgrams...\n");
- string listFileName = cluster(distFileName, namesFileName);
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ cluster(listFileName, distFileName, namesFileName);
if (m->control_pressed) { break; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
- getOTUData(listFileName);
+ int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
if (m->control_pressed) { break; }
m->mothurRemove(namesFileName);
m->mothurRemove(listFileName);
- initPyroCluster();
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
if (m->control_pressed) { break; }
double maxDelta = 0;
int iter = 0;
- double begClock = clock();
- unsigned long long begTime = time(NULL);
-
+ begClock = clock();
+ begTime = time(NULL);
+
m->mothurOut("\nDenoising flowgrams...\n");
m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
double cycClock = clock();
unsigned long long cycTime = time(NULL);
- fill();
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
if (m->control_pressed) { break; }
-
- calcCentroids();
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
if (m->control_pressed) { break; }
-
- maxDelta = getNewWeights(); if (m->control_pressed) { break; }
- double nLL = getLikelihood(); if (m->control_pressed) { break; }
- checkCentroids();
+
+ maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+
+ if (m->control_pressed) { break; }
+
+ double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+
+ if (m->control_pressed) { break; }
+
+ checkCentroids(numOTUs, centroids, weight);
if (m->control_pressed) { break; }
- calcNewDistances();
+ calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
if (m->control_pressed) { break; }
iter++;
m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
-
+
}
if (m->control_pressed) { break; }
m->mothurOut("\nFinalizing...\n");
- fill();
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
if (m->control_pressed) { break; }
- setOTUs();
+ setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
if (m->control_pressed) { break; }
vector<int> otuCounts(numOTUs, 0);
for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
- calcCentroidsDriver(0, numOTUs); if (m->control_pressed) { break; }
- writeQualities(otuCounts); if (m->control_pressed) { break; }
- writeSequences(otuCounts); if (m->control_pressed) { break; }
- writeNames(otuCounts); if (m->control_pressed) { break; }
- writeClusters(otuCounts); if (m->control_pressed) { break; }
- writeGroups(); if (m->control_pressed) { break; }
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+ writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+ writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
+ writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
+ writeGroups(flowFileName, numSeqs, seqNameVector); if (m->control_pressed) { break; }
m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
-
- if(compositeFASTAFileName != ""){
- outputNames.push_back(compositeFASTAFileName);
- outputNames.push_back(compositeNamesFileName);
- }
-
- m->mothurOutEndLine();
- m->mothurOut("Output File Names: "); m->mothurOutEndLine();
- for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
- m->mothurOutEndLine();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "execute");
- exit(1);
- }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ return 0;
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "driver");
+ exit(1);
+ }
}
-#endif
-/**************************************************************************************************/
-void ShhherCommand::getFlowData(){
+/**************************************************************************************************/
+int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
try{
+
ifstream flowFile;
- m->openInputFile(flowFileName, flowFile);
+
+ m->openInputFile(filename, flowFile);
string seqName;
- seqNameVector.clear();
- lengths.clear();
- flowDataIntI.clear();
- nameMap.clear();
-
-
int currentNumFlowCells;
-
float intensity;
+ thisSeqNameVector.clear();
+ thisLengths.clear();
+ thisFlowDataIntI.clear();
+ thisNameMap.clear();
flowFile >> numFlowCells;
int index = 0;//pcluster
if (m->control_pressed) { break; }
flowFile >> seqName >> currentNumFlowCells;
- lengths.push_back(currentNumFlowCells);
-
- seqNameVector.push_back(seqName);
- nameMap[seqName] = index++;//pcluster
+ thisLengths.push_back(currentNumFlowCells);
+
+ thisSeqNameVector.push_back(seqName);
+ thisNameMap[seqName] = index++;//pcluster
for(int i=0;i<numFlowCells;i++){
flowFile >> intensity;
if(intensity > 9.99) { intensity = 9.99; }
int intI = int(100 * intensity + 0.0001);
- flowDataIntI.push_back(intI);
+ thisFlowDataIntI.push_back(intI);
}
m->gobble(flowFile);
}
flowFile.close();
- numSeqs = seqNameVector.size();
+ int numSeqs = thisSeqNameVector.size();
for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { break; }
int iNumFlowCells = i * numFlowCells;
- for(int j=lengths[i];j<numFlowCells;j++){
- flowDataIntI[iNumFlowCells + j] = 0;
+ for(int j=thisLengths[i];j<numFlowCells;j++){
+ thisFlowDataIntI[iNumFlowCells + j] = 0;
}
}
+
+ return numSeqs;
}
catch(exception& e) {
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::getSingleLookUp(){
- try{
- // these are the -log probabilities that a signal corresponds to a particular homopolymer length
- singleLookUp.assign(HOMOPS * NUMBINS, 0);
+int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
+ try{
+
+ ostringstream outStream;
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.setf(ios::dec, ios::basefield);
+ outStream.setf(ios::showpoint);
+ outStream.precision(6);
- int index = 0;
- ifstream lookUpFile;
- m->openInputFile(lookupFileName, lookUpFile);
-
- for(int i=0;i<HOMOPS;i++){
+ int begTime = time(NULL);
+ double begClock = clock();
+
+ for(int i=0;i<stopSeq;i++){
if (m->control_pressed) { break; }
- float logFracFreq;
- lookUpFile >> logFracFreq;
-
- for(int j=0;j<NUMBINS;j++) {
- lookUpFile >> singleLookUp[index];
- index++;
+ for(int j=0;j<i;j++){
+ float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+
+ if(flowDistance < 1e-6){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+ }
+ else if(flowDistance <= cutoff){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+ }
}
- }
- lookUpFile.close();
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getSingleLookUp");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::getJointLookUp(){
- try{
+ if(i % 100 == 0){
+ m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
+ m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ m->mothurOutEndLine();
+ }
+ }
- // the most likely joint probability (-log) that two intenities have the same polymer length
- jointLookUp.resize(NUMBINS * NUMBINS, 0);
+ ofstream distFile(distFileName.c_str());
+ distFile << outStream.str();
+ distFile.close();
- for(int i=0;i<NUMBINS;i++){
-
- if (m->control_pressed) { break; }
-
- for(int j=0;j<NUMBINS;j++){
-
- double minSum = 100000000;
-
- for(int k=0;k<HOMOPS;k++){
- double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
-
- if(sum < minSum) { minSum = sum; }
- }
- jointLookUp[i * NUMBINS + j] = minSum;
- }
+ if (m->control_pressed) {}
+ else {
+ m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+ m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ m->mothurOutEndLine();
}
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getJointLookUp");
+ m->errorOut(e, "ShhherCommand", "flowDistParentFork");
exit(1);
}
}
-
/**************************************************************************************************/
-double ShhherCommand::getProbIntensity(int intIntensity){
+//Per-flow distance between flowgrams seqA and seqB over the shorter of the two
+//lengths: for each flow, take the best joint -log probability that both observed
+//intensities arose from the same homopolymer length (jointLookUp) and subtract
+//each flowgram's own per-flow -log probability (flowDataPrI), then average.
+float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
try{
-
- double minNegLogProb = 100000000;
-
+ int minLength = lengths[mapSeqToUnique[seqA]];
+ //NOTE(review): the comparison reads lengths[seqB] but the assignment reads
+ //lengths[mapSeqToUnique[seqB]] - these indices differ in meaning (the seqA side
+ //uses the mapSeqToUnique form in both places); confirm which is intended.
+ if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
- for(int i=0;i<HOMOPS;i++){//loop signal strength
+ //row offsets into the flattened (sequence x flow) arrays
+ int ANumFlowCells = seqA * numFlowCells;
+ int BNumFlowCells = seqB * numFlowCells;
+
+ float dist = 0;
+
+ for(int i=0;i<minLength;i++){
if (m->control_pressed) { break; }
- float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
- if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
+ int flowAIntI = flowDataIntI[ANumFlowCells + i];
+ float flowAPrI = flowDataPrI[ANumFlowCells + i];
+
+ int flowBIntI = flowDataIntI[BNumFlowCells + i];
+ float flowBPrI = flowDataPrI[BNumFlowCells + i];
+ dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
}
- return minNegLogProb;
+ //normalize by the number of compared flows so pairs of different lengths are comparable
+ dist /= (float) minLength;
+ return dist;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getProbIntensity");
+ m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
exit(1);
}
}
/**************************************************************************************************/
-void ShhherCommand::getUniques(){
+int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
try{
-
-
- numUniques = 0;
+ int numUniques = 0;
uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
uniqueCount.assign(numSeqs, 0); // anWeights
uniqueLengths.assign(numSeqs, 0);
for(int j=0;j<numFlowCells;j++){
current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
}
-
+
for(int j=0;j<numUniques;j++){
int offset = j * numFlowCells;
bool toEnd = 1;
int shorterLength;
if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
else { shorterLength = uniqueLengths[j]; }
-
+
for(int k=0;k<shorterLength;k++){
if(current[k] != uniqueFlowgrams[offset + k]){
toEnd = 0;
flowDataPrI.resize(numSeqs * numFlowCells, 0);
for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
+
+ return numUniques;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "getUniques");
exit(1);
}
}
-
/**************************************************************************************************/
-
-float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
- try{
- int minLength = lengths[mapSeqToUnique[seqA]];
- if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
-
- int ANumFlowCells = seqA * numFlowCells;
- int BNumFlowCells = seqB * numFlowCells;
-
- float dist = 0;
-
- for(int i=0;i<minLength;i++){
-
- if (m->control_pressed) { break; }
-
- int flowAIntI = flowDataIntI[ANumFlowCells + i];
- float flowAPrI = flowDataPrI[ANumFlowCells + i];
-
- int flowBIntI = flowDataIntI[BNumFlowCells + i];
- float flowBPrI = flowDataPrI[BNumFlowCells + i];
- dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
- }
-
- dist /= (float) minLength;
- return dist;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
- try{
-
- ostringstream outStream;
- outStream.setf(ios::fixed, ios::floatfield);
- outStream.setf(ios::dec, ios::basefield);
- outStream.setf(ios::showpoint);
- outStream.precision(6);
-
- int begTime = time(NULL);
- double begClock = clock();
-
- for(int i=startSeq;i<stopSeq;i++){
-
- if (m->control_pressed) { break; }
-
- for(int j=0;j<i;j++){
- float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
-
- if(flowDistance < 1e-6){
- outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
- }
- else if(flowDistance <= cutoff){
- outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
- }
- }
- if(i % 100 == 0){
- m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
- m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- m->mothurOutEndLine();
- }
- }
-
- ofstream distFile(distFileName.c_str());
- distFile << outStream.str();
- distFile.close();
-
- if (m->control_pressed) {}
- else {
- m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
- m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- m->mothurOutEndLine();
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "flowDistParentFork");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-string ShhherCommand::createDistFile(int processors){
- try{
-//////////////////////// until I figure out the shared memory issue //////////////////////
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-#else
- processors=1;
-#endif
-//////////////////////// until I figure out the shared memory issue //////////////////////
-
- string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
-
- unsigned long long begTime = time(NULL);
- double begClock = clock();
-
- if (numSeqs < processors){ processors = 1; }
-
- if(processors == 1) { flowDistParentFork(fDistFileName, 0, numUniques); }
-
- else{ //you have multiple processors
-
- vector<int> start(processors, 0);
- vector<int> end(processors, 0);
-
- int process = 1;
- vector<int> processIDs;
-
- for (int i = 0; i < processors; i++) {
- start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
- end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
- }
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- flowDistParentFork(fDistFileName, start[0], end[0]);
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
-#else
- //////////////////////////////////////////////////////////////////////////////////////////////////////
- //Windows version shared memory, so be careful when passing variables through the flowDistParentForkData struct.
- //Above fork() will clone, so memory is separate, but that's not the case with windows,
- //////////////////////////////////////////////////////////////////////////////////////////////////////
-
- vector<flowDistParentForkData*> pDataArray;
- DWORD dwThreadIdArray[processors-1];
- HANDLE hThreadArray[processors-1];
-
- //Create processor worker threads.
- for(int i = 0; i < processors-1; i++){
- // Allocate memory for thread data.
- string extension = extension = toString(i) + ".temp";
-
- flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i);
- pDataArray.push_back(tempdist);
- processIDs.push_back(i);
-
- //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
- //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
- hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
- }
-
- //parent does its part
- flowDistParentFork(fDistFileName, start[0], end[0]);
-
- //Wait until all threads have terminated.
- WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
-
- //Close all thread handles and free memory allocations.
- for(int i=0; i < pDataArray.size(); i++){
- CloseHandle(hThreadArray[i]);
- delete pDataArray[i];
- }
-
-#endif
-
- //append and remove temp files
- for (int i=0;i<processIDs.size();i++) {
- m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
- m->mothurRemove((fDistFileName + toString(processIDs[i]) + ".temp"));
- }
-
- }
-
- m->mothurOutEndLine();
- m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
-
- return fDistFileName;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "createDistFile");
- exit(1);
- }
-
-}
-
-/**************************************************************************************************/
-
-string ShhherCommand::createNamesFile(){
+int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
try{
vector<string> duplicateNames(numUniques, "");
duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
}
- string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
-
ofstream nameFile;
- m->openOutputFile(nameFileName, nameFile);
+ m->openOutputFile(filename, nameFile);
for(int i=0;i<numUniques;i++){
if (m->control_pressed) { break; }
-// nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
}
nameFile.close();
- return nameFileName;
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "createNamesFile");
exit(1);
}
}
-
//**********************************************************************************************************************
-string ShhherCommand::cluster(string distFileName, string namesFileName){
+int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
try {
ReadMatrix* read = new ReadColumnMatrix(distFileName);
delete read;
delete clusterNameMap;
-
+
RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
list->setLabel(toString(cutoff));
- string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
ofstream listFile;
- m->openOutputFile(listFileName, listFile);
+ m->openOutputFile(filename, listFile);
list->print(listFile);
listFile.close();
delete matrix; delete cluster; delete rabund; delete list;
-
- return listFileName;
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "cluster");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::getOTUData(string listFileName){
+int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
+ vector<int>& cumNumSeqs,
+ vector<int>& nSeqsPerOTU,
+ vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
+ vector<int>& seqIndex,
+ map<string, int>& nameMap){
try {
-
+
ifstream listFile;
- m->openInputFile(listFileName, listFile);
+ m->openInputFile(fileName, listFile);
string label;
+ int numOTUs;
listFile >> label >> numOTUs;
-
+
otuData.assign(numSeqs, 0);
cumNumSeqs.assign(numOTUs, 0);
nSeqsPerOTU.assign(numOTUs, 0);
for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
-
+
listFile >> singleOTU;
istringstream otuString(singleOTU);
-
+
while(otuString){
string seqName = "";
}
map<string,int>::iterator nmIt = nameMap.find(seqName);
-
+
int index = nmIt->second;
nameMap.erase(nmIt);
-
+
otuData[index] = i;
nSeqsPerOTU[i]++;
aaP[i].push_back(index);
seqIndex = seqNumber;
listFile.close();
+
+ return numOTUs;
}
catch(exception& e) {
exit(1);
}
}
-
-/**************************************************************************************************/
-
-void ShhherCommand::initPyroCluster(){
- try{
- if (numOTUs < processors) { processors = 1; }
-
- dist.assign(numSeqs * numOTUs, 0);
- change.assign(numOTUs, 1);
- centroids.assign(numOTUs, -1);
- weight.assign(numOTUs, 0);
- singleTau.assign(numSeqs, 1.0);
-
- nSeqsBreaks.assign(processors+1, 0);
- nOTUsBreaks.assign(processors+1, 0);
-
- nSeqsBreaks[0] = 0;
- for(int i=0;i<processors;i++){
- nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
- nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
- }
- nSeqsBreaks[processors] = numSeqs;
- nOTUsBreaks[processors] = numOTUs;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "initPyroCluster");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::fill(){
- try {
- int index = 0;
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- cumNumSeqs[i] = index;
- for(int j=0;j<nSeqsPerOTU[i];j++){
- seqNumber[index] = aaP[i][j];
- seqIndex[index] = aaI[i][j];
-
- index++;
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "fill");
- exit(1);
- }
-}
-
/**************************************************************************************************/
-void ShhherCommand::calcCentroids(){
- try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- if(processors == 1) {
- calcCentroidsDriver(0, numOTUs);
- }
- else{ //you have multiple processors
- if (numOTUs < processors){ processors = 1; }
-
- int process = 1;
- vector<int> processIDs;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = vfork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
- }
-
-#else
- calcCentroidsDriver(0, numOTUs);
-#endif
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::calcCentroidsDriver(int start, int finish){
+int ShhherCommand::calcCentroidsDriver(int numOTUs,
+ vector<int>& cumNumSeqs,
+ vector<int>& nSeqsPerOTU,
+ vector<int>& seqIndex,
+ vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int>& centroids, //the representative flowgram for each cluster m
+ vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int>& mapSeqToUnique,
+ vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI,
+ vector<int>& lengths,
+ int numFlowCells,
+ vector<int>& seqNumber){
//this function gets the most likely homopolymer length at a flow position for a group of sequences
//within an otu
try{
- for(int i=start;i<finish;i++){
+ for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
for(int j=0;j<nSeqsPerOTU[i];j++){
count += singleTau[seqNumber[cumNumSeqs[i] + j]];
}
-
+
if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
vector<double> adF(nSeqsPerOTU[i]);
vector<int> anL(nSeqsPerOTU[i]);
double tauValue = singleTau[seqNumber[index]];
for(int k=0;k<position;k++){
- double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
adF[k] += dist * tauValue;
}
}
centroids[i] = -1;
}
}
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
exit(1);
}
}
-
/**************************************************************************************************/
-double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI, int numFlowCells){
try{
int flowAValue = cent * numFlowCells;
int flowBValue = flow * numFlowCells;
double dist = 0;
-
+
for(int i=0;i<length;i++){
dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
flowAValue++;
exit(1);
}
}
-
-/**************************************************************************************************/
-
-double ShhherCommand::getNewWeights(){
- try{
-
- double maxChange = 0;
-
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- double difference = weight[i];
- weight[i] = 0;
-
- for(int j=0;j<nSeqsPerOTU[i];j++){
- int index = cumNumSeqs[i] + j;
- double tauValue = singleTau[seqNumber[index]];
- weight[i] += tauValue;
- }
-
- difference = fabs(weight[i] - difference);
- if(difference > maxChange){ maxChange = difference; }
- }
- return maxChange;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
/**************************************************************************************************/
-double ShhherCommand::getLikelihood(){
-
- try{
-
- vector<long double> P(numSeqs, 0);
- int effNumOTUs = 0;
-
- for(int i=0;i<numOTUs;i++){
- if(weight[i] > MIN_WEIGHT){
- effNumOTUs++;
- }
- }
-
- string hold;
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- for(int j=0;j<nSeqsPerOTU[i];j++){
- int index = cumNumSeqs[i] + j;
- int nI = seqIndex[index];
- double singleDist = dist[seqNumber[index]];
-
- P[nI] += weight[i] * exp(-singleDist * sigma);
- }
- }
- double nLL = 0.00;
- for(int i=0;i<numSeqs;i++){
- if(P[i] == 0){ P[i] = DBL_EPSILON; }
-
- nLL += -log(P[i]);
- }
-
- nLL = nLL -(double)numSeqs * log(sigma);
-
- return nLL;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::checkCentroids(){
- try{
- vector<int> unique(numOTUs, 1);
-
- for(int i=0;i<numOTUs;i++){
- if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
- unique[i] = -1;
- }
- }
-
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- if(unique[i] == 1){
- for(int j=i+1;j<numOTUs;j++){
- if(unique[j] == 1){
-
- if(centroids[j] == centroids[i]){
- unique[j] = 0;
- centroids[j] = -1;
-
- weight[i] += weight[j];
- weight[j] = 0.0;
- }
- }
- }
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "checkCentroids");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::calcNewDistances(){
- try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- if(processors == 1) {
- calcNewDistancesParent(0, numSeqs);
- }
- else{ //you have multiple processors
- if (numSeqs < processors){ processors = 1; }
-
- vector<vector<int> > child_otuIndex(processors);
- vector<vector<int> > child_seqIndex(processors);
- vector<vector<double> > child_singleTau(processors);
- vector<int> totals(processors);
-
- int process = 1;
- vector<int> processIDs;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = vfork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
- totals[process] = child_otuIndex[process].size();
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
- int total = seqIndex.size();
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
-
- for(int i=1;i<processors;i++){
- int oldTotal = total;
- total += totals[i];
-
- singleTau.resize(total, 0);
- seqIndex.resize(total, 0);
- seqNumber.resize(total, 0);
-
- int childIndex = 0;
-
- for(int j=oldTotal;j<total;j++){
- int otuI = child_otuIndex[i][childIndex];
- int seqI = child_seqIndex[i][childIndex];
-
- singleTau[j] = child_singleTau[i][childIndex];
- aaP[otuI][nSeqsPerOTU[otuI]] = j;
- aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
- nSeqsPerOTU[otuI]++;
-
- childIndex++;
- }
- }
- }
-#else
- calcNewDistancesParent(0, numSeqs);
-#endif
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistances");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-#ifdef USE_MPI
-void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
-
- try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- otuIndex.clear();
- seqIndex.clear();
- singleTau.clear();
-
- for(int i=startSeq;i<stopSeq;i++){
-
- if (m->control_pressed) { break; }
-
- double offset = 1e8;
- int indexOffset = i * numOTUs;
-
- for(int j=0;j<numOTUs;j++){
-
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
- }
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
- }
-
- for(int j=0;j<numOTUs;j++){
-
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- otuIndex.push_back(j);
- seqIndex.push_back(i);
- singleTau.push_back(newTau[j]);
- }
+double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
+ try{
+
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
}
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
}
+ return maxChange;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
exit(1);
}
}
-#endif
+
/**************************************************************************************************/
-void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
+double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- child_otuIndex.resize(0);
- child_seqIndex.resize(0);
- child_singleTau.resize(0);
- for(int i=startSeq;i<stopSeq;i++){
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
- double offset = 1e8;
- int indexOffset = i * numOTUs;
-
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
+ P[nI] += weight[i] * exp(-singleDist * sigma);
}
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
+ try{
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
}
+ }
+
+ for(int i=0;i<numOTUs;i++){
- for(int j=0;j<numOTUs;j++){
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- child_otuIndex.push_back(j);
- child_seqIndex.push_back(i);
- child_singleTau.push_back(newTau[j]);
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
}
}
}
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
+ vector<double>& weight, vector<short>& change, vector<int>& centroids,
+ vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
+ vector<int>& seqNumber, vector<int>& seqIndex,
+ vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
try{
vector<double> newTau(numOTUs,0);
vector<double> norms(numSeqs, 0);
nSeqsPerOTU.assign(numOTUs, 0);
-
- for(int i=startSeq;i<stopSeq;i++){
+
+ for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { break; }
int indexOffset = i * numOTUs;
-
+
double offset = 1e8;
for(int j=0;j<numOTUs;j++){
-
+
if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
}
-
+
if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
offset = dist[indexOffset + j];
}
}
-
+
for(int j=0;j<numOTUs;j++){
if(weight[j] > MIN_WEIGHT){
newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
newTau[j] = 0.0;
}
}
-
+
for(int j=0;j<numOTUs;j++){
newTau[j] /= norms[i];
}
-
+
for(int j=0;j<numOTUs;j++){
if(newTau[j] > MIN_TAU){
nSeqsPerOTU[j]++;
}
}
-
+
}
-
+
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+ m->errorOut(e, "ShhherCommand", "calcNewDistances");
exit(1);
}
}
+/**************************************************************************************************/
+int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
+ try {
+ int index = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = index;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[index] = aaP[i][j];
+ seqIndex[index] = aaI[i][j];
+
+ index++;
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "fill");
+ exit(1);
+ }
+}
/**************************************************************************************************/
-void ShhherCommand::setOTUs(){
+void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
+ vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
try {
vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
nSeqsPerOTU[index]++;
}
- fill();
+
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistances");
+ m->errorOut(e, "ShhherCommand", "setOTUs");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::writeQualities(vector<int> otuCounts){
+void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
+ vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
+ vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.qual";
-
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
+
ofstream qualityFile;
m->openOutputFile(qualityFileName, qualityFile);
-
+
qualityFile.setf(ios::fixed, ios::floatfield);
qualityFile.setf(ios::showpoint);
qualityFile << setprecision(6);
}
qualityFile.close();
outputNames.push_back(qualityFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeQualities");
/**************************************************************************************************/
-void ShhherCommand::writeSequences(vector<int> otuCounts){
+void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.fasta";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
ofstream fastaFile;
m->openOutputFile(fastaFileName, fastaFile);
}
}
fastaFile.close();
-
+
outputNames.push_back(fastaFileName);
-
- if(compositeFASTAFileName != ""){
- m->appendFiles(fastaFileName, compositeFASTAFileName);
+
+ if(thisCompositeFASTAFileName != ""){
+ m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
}
}
catch(exception& e) {
/**************************************************************************************************/
-void ShhherCommand::writeNames(vector<int> otuCounts){
+void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.names";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
ofstream nameFile;
m->openOutputFile(nameFileName, nameFile);
outputNames.push_back(nameFileName);
- if(compositeNamesFileName != ""){
- m->appendFiles(nameFileName, compositeNamesFileName);
+ if(thisCompositeNamesFileName != ""){
+ m->appendFiles(nameFileName, thisCompositeNamesFileName);
}
}
catch(exception& e) {
/**************************************************************************************************/
-void ShhherCommand::writeGroups(){
+void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
- string groupFileName = fileRoot + ".shhh.groups";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
+ string groupFileName = fileRoot + "shhh.groups";
ofstream groupFile;
m->openOutputFile(groupFileName, groupFile);
}
groupFile.close();
outputNames.push_back(groupFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeGroups");
/**************************************************************************************************/
-void ShhherCommand::writeClusters(vector<int> otuCounts){
+void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.counts";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
ofstream otuCountsFile;
m->openOutputFile(otuCountsFileName, otuCountsFile);
for(int k=0;k<lengths[sequence];k++){
char base = bases[k % 4];
int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
-
+
for(int s=0;s<freq;s++){
newSeq += base;
//otuCountsFile << base;
}
otuCountsFile.close();
outputNames.push_back(otuCountsFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeClusters");
}
}
-//**********************************************************************************************************************
+/**************************************************************************************************/
+
+void ShhherCommand::getSingleLookUp(){
+ try{
+ // these are the -log probabilities that a signal corresponds to a particular homopolymer length
+ singleLookUp.assign(HOMOPS * NUMBINS, 0);
+
+ int index = 0;
+ ifstream lookUpFile;
+ m->openInputFile(lookupFileName, lookUpFile);
+
+ for(int i=0;i<HOMOPS;i++){
+
+ if (m->control_pressed) { break; }
+
+ float logFracFreq;
+ lookUpFile >> logFracFreq;
+
+ for(int j=0;j<NUMBINS;j++) {
+ lookUpFile >> singleLookUp[index];
+ index++;
+ }
+ }
+ lookUpFile.close();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getSingleLookUp");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::getJointLookUp(){
+ try{
+
+ // the most likely joint probability (-log) that two intenities have the same polymer length
+ jointLookUp.resize(NUMBINS * NUMBINS, 0);
+
+ for(int i=0;i<NUMBINS;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<NUMBINS;j++){
+
+ double minSum = 100000000;
+
+ for(int k=0;k<HOMOPS;k++){
+ double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
+
+ if(sum < minSum) { minSum = sum; }
+ }
+ jointLookUp[i * NUMBINS + j] = minSum;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getJointLookUp");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getProbIntensity(int intIntensity){
+ try{
+
+ double minNegLogProb = 100000000;
+
+
+ for(int i=0;i<HOMOPS;i++){//loop signal strength
+
+ if (m->control_pressed) { break; }
+
+ float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
+ if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
+ }
+
+ return minNegLogProb;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getProbIntensity");
+ exit(1);
+ }
+}
+
+
+
+
#include "mothur.h"
#include "command.hpp"
+#include "readcolumn.h"
+#include "readmatrix.hpp"
+#include "rabundvector.hpp"
+#include "sabundvector.hpp"
+#include "listvector.hpp"
+#include "cluster.hpp"
+#include "sparsematrix.hpp"
+#include <cfloat>
//**********************************************************************************************************************
void help() { m->mothurOut(getHelpString()); }
private:
+ struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ };
+
int abort;
-
string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
int processors, maxIters;
float cutoff, sigma, minDelta;
string flowOrder;
-
- vector<int> nSeqsBreaks;
- vector<int> nOTUsBreaks;
+
+ vector<string> outputNames;
vector<double> singleLookUp;
vector<double> jointLookUp;
+ vector<string> flowFileVector;
+
+ int driver(vector<string>, string, string, int, int);
+ int createProcesses(vector<string>);
+ int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
+ int getUniques(int, int, vector<short>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ int flowDistParentFork(int, string, int, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ float calcPairwiseDist(int, int, int, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ int createNamesFile(int, int, string, vector<string>&, vector<int>&, vector<int>&);
+ int cluster(string, string, string);
+ int getOTUData(int numSeqs, string, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<int>&, vector<int>&,map<string, int>&);
+ int calcCentroidsDriver(int numOTUs, vector<int>&, vector<int>&, vector<int>&, vector<short>&, vector<int>&, vector<double>&, vector<int>&, vector<short>&, vector<short>&, vector<int>&, int, vector<int>&);
+ double getDistToCentroid(int, int, int, vector<short>&, vector<short>&, int);
+ double getNewWeights(int, vector<int>&, vector<int>&, vector<double>&, vector<int>&, vector<double>&);
+
+ double getLikelihood(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<double>&);
+ int checkCentroids(int, vector<int>&, vector<double>&);
+ void calcNewDistances(int, int, vector<int>& , vector<double>&,vector<double>& , vector<short>& change, vector<int>&,vector<vector<int> >&, vector<double>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&, vector<short>&, int, vector<int>&);
+ int fill(int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&);
+ void setOTUs(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&,
+ vector<int>&, vector<double>&, vector<double>&, vector<vector<int> >&, vector<vector<int> >&);
+ void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
+ void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
+ void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
+ void writeGroups(string, int, vector<string>&);
+ void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
+
+ void getSingleLookUp();
+ void getJointLookUp();
+ double getProbIntensity(int);
- vector<string> seqNameVector;
+
+#ifdef USE_MPI
+ string flowDistMPI(int, int);
+ void calcNewDistancesChildMPI(int, int, vector<int>&);
+
+ int pid, ncpus;
+
+ void getFlowData();
+ void getUniques();
+
+ float calcPairwiseDist(int, int);
+ void flowDistParentFork(string, int, int);
+
+ string createDistFile(int);
+ string createNamesFile();
+ string cluster(string, string);
+
+ void getOTUData(string);
+ void initPyroCluster();
+ void fill();
+ void calcCentroids();
+ void calcCentroidsDriver(int, int);
+ double getDistToCentroid(int, int, int);
+ double getNewWeights();
+ double getLikelihood();
+ void checkCentroids();
+ void calcNewDistances();
+ void calcNewDistancesParent(int, int);
+ void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
+
+
+ void setOTUs();
+ void writeQualities(vector<int>);
+ void writeSequences(vector<int>);
+ void writeNames(vector<int>);
+ void writeGroups();
+ void writeClusters(vector<int>);
+
+ vector<string> seqNameVector;
vector<int> lengths;
vector<short> flowDataIntI;
vector<double> flowDataPrI;
vector<int> mapSeqToUnique;
vector<int> mapUniqueToSeq;
vector<int> uniqueLengths;
+ int numSeqs, numUniques, numOTUs, numFlowCells;
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
- vector<string> outputNames;
-
- int numSeqs, numUniques, numOTUs, numFlowCells;
-
- void getSingleLookUp();
- void getJointLookUp();
- void getFlowData();
- void getUniques();
- double getProbIntensity(int);
- float calcPairwiseDist(int, int);
- void flowDistParentFork(string, int, int);
-
- string createDistFile(int);
- string createNamesFile();
- string cluster(string, string);
-
- void getOTUData(string);
- void initPyroCluster();
- void fill();
- void calcCentroids();
- void calcCentroidsDriver(int, int);
- double getDistToCentroid(int, int, int);
- double getNewWeights();
- double getLikelihood();
- void checkCentroids();
- void calcNewDistances();
- void calcNewDistancesParent(int, int);
- void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
-
-
- void setOTUs();
- void writeQualities(vector<int>);
- void writeSequences(vector<int>);
- void writeNames(vector<int>);
- void writeGroups();
- void writeClusters(vector<int>);
-
-
-#ifdef USE_MPI
- string flowDistMPI(int, int);
- void calcNewDistancesChildMPI(int, int, vector<int>&);
-
- int pid, ncpus;
#endif
};
//custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
-struct flowDistParentForkData {
- string distFileName;
- vector<int> mapUniqueToSeq;
- vector<int> mapSeqToUnique;
- vector<int> lengths;
- vector<short> flowDataIntI;
- vector<double> flowDataPrI;
+struct shhhFlowsData {
+ int threadID, maxIters;
+ float cutoff, sigma, minDelta;
+ string flowOrder;
+ vector<double> singleLookUp;
vector<double> jointLookUp;
+ vector<string> filenames;
+ vector<string> outputNames;
+ string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir;
+ int start, stop;
MothurOut* m;
- int threadID, startSeq, stopSeq, numFlowCells;
- float cutoff;
- flowDistParentForkData(){}
- flowDistParentForkData(string d, vector<int> mapU, vector<int> mapS, vector<int> l, vector<short> flowD, vector<double> flowDa, vector<double> j, MothurOut* mout, int st, int sp, int n, float cut, int tid) {
- distFileName = d;
- mapUniqueToSeq = mapU;
- mapSeqToUnique = mapS;
- lengths = l;
- flowDataIntI = flowD;
- flowDataPrI = flowDa;
- jointLookUp = j;
+ shhhFlowsData(){}
+ shhhFlowsData(vector<string> f, string cf, string cn, string ou, string flor, vector<double> jl, vector<double> sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) {
+ filenames = f;
+ thisCompositeFASTAFileName = cf;
+ thisCompositeNameFileName = cn;
+ outputDir = ou;
+ flowOrder = flor;
m = mout;
- startSeq = st;
- stopSeq = sp;
- numFlowCells = n;
+ start = st;
+ stop = sp;
cutoff= cut;
+ sigma = si;
+ minDelta = mD;
+ maxIters = mx;
+ jointLookUp = jl;
+ singleLookUp = sl;
threadID = tid;
}
};
/**************************************************************************************************/
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
#else
-static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){
- flowDistParentForkData* pDataArray;
- pDataArray = (flowDistParentForkData*)lpParam;
+static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
+ shhhFlowsData* pDataArray;
+ pDataArray = (shhhFlowsData*)lpParam;
try {
- ostringstream outStream;
- outStream.setf(ios::fixed, ios::floatfield);
- outStream.setf(ios::dec, ios::basefield);
- outStream.setf(ios::showpoint);
- outStream.precision(6);
-
- int begTime = time(NULL);
- double begClock = clock();
- string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-";
- cout << tempOut << endl;
+
+ for(int l=pDataArray->start;l<pDataArray->stop;l++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ string flowFileName = pDataArray->filenames[l];
+
+ pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n");
+ pDataArray->m->mothurOut("Reading flowgrams...\n");
+
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+ /*****************************************************************************************************/
+
+ ifstream flowFile;
+ // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
+ pDataArray->m->openInputFile(flowFileName, flowFile);
+
+ // cout << "herethread " << flowFileName << endl;
+ string seqName;
+ int currentNumFlowCells;
+ float intensity;
+
+ flowFile >> numFlowCells;
+ int index = 0;//pcluster
+ while(!flowFile.eof()){
+
+ if (pDataArray->m->control_pressed) { flowFile.close(); return 0; }
+
+ flowFile >> seqName >> currentNumFlowCells;
+ lengths.push_back(currentNumFlowCells);
+ // cout << "herethread " << seqName << endl;
+ seqNameVector.push_back(seqName);
+ nameMap[seqName] = index++;//pcluster
+
+ for(int i=0;i<numFlowCells;i++){
+ flowFile >> intensity;
+ if(intensity > 9.99) { intensity = 9.99; }
+ int intI = int(100 * intensity + 0.0001);
+ flowDataIntI.push_back(intI);
+ }
+ pDataArray->m->gobble(flowFile);
+ }
+ flowFile.close();
+
+ int numSeqs = seqNameVector.size();
+ // cout << numSeqs << endl;
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ int iNumFlowCells = i * numFlowCells;
+ for(int j=lengths[i];j<numFlowCells;j++){
+ flowDataIntI[iNumFlowCells + j] = 0;
+ }
+ }
+ // cout << "here" << endl;
+ /*****************************************************************************************************/
- for(int i=pDataArray->startSeq;i<pDataArray->stopSeq;i++){
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
+ //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ int numUniques = 0;
+ uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+ uniqueCount.assign(numSeqs, 0); // anWeights
+ uniqueLengths.assign(numSeqs, 0);
+ mapSeqToUnique.assign(numSeqs, -1);
+ mapUniqueToSeq.assign(numSeqs, -1);
+
+ vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ int index = 0;
+
+ vector<short> current(numFlowCells);
+ for(int j=0;j<numFlowCells;j++){
+ current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
+ }
+
+ for(int j=0;j<numUniques;j++){
+ int offset = j * numFlowCells;
+ bool toEnd = 1;
+
+ int shorterLength;
+ if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
+ else { shorterLength = uniqueLengths[j]; }
+
+ for(int k=0;k<shorterLength;k++){
+ if(current[k] != uniqueFlowgrams[offset + k]){
+ toEnd = 0;
+ break;
+ }
+ }
+
+ if(toEnd){
+ mapSeqToUnique[i] = j;
+ uniqueCount[j]++;
+ index = j;
+ if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
+ break;
+ }
+ index++;
+ }
+
+ if(index == numUniques){
+ uniqueLengths[numUniques] = lengths[i];
+ uniqueCount[numUniques] = 1;
+ mapSeqToUnique[i] = numUniques;//anMap
+ mapUniqueToSeq[numUniques] = i;//anF
+
+ for(int k=0;k<numFlowCells;k++){
+ uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+ uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+ }
+
+ numUniques++;
+ }
+ }
+ uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+ uniqueLengths.resize(numUniques);
+
+ flowDataPrI.resize(numSeqs * numFlowCells, 0);
+ for(int i=0;i<flowDataPrI.size();i++) {
+ if (pDataArray->m->control_pressed) { return 0; }
+ //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);
+
+ flowDataPrI[i] = 100000000;
+
+ for(int j=0;j<HOMOPS;j++){//loop signal strength
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]];
+ if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; }
+ }
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("Calculating distances between flowgrams...\n");
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ unsigned long long begTime = time(NULL);
+ double begClock = clock();
+
+ //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ ostringstream outStream;
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.setf(ios::dec, ios::basefield);
+ outStream.setf(ios::showpoint);
+ outStream.precision(6);
+
+ int thisbegTime = time(NULL);
+ double thisbegClock = clock();
+
+ for(int i=0;i<numUniques;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<i;j++){
+ //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
+ int minLength = lengths[mapSeqToUnique[seqA]];
+ if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
+
+ int ANumFlowCells = seqA * numFlowCells;
+ int BNumFlowCells = seqB * numFlowCells;
+
+ float flowDistance = 0;
+
+ for(int k=0;k<minLength;k++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int flowAIntI = flowDataIntI[ANumFlowCells + k];
+ float flowAPrI = flowDataPrI[ANumFlowCells + k];
+
+ int flowBIntI = flowDataIntI[BNumFlowCells + k];
+ float flowBPrI = flowDataPrI[BNumFlowCells + k];
+ flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+ }
+
+ flowDistance /= (float) minLength;
+ /*****************************************************************************************************/
+
+ if(flowDistance < 1e-6){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+ }
+ else if(flowDistance <= pDataArray->cutoff){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+ }
+ }
+ if(i % 100 == 0){
+ pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+ }
+
+ ofstream distFile(distFileName.c_str());
+ distFile << outStream.str();
+ distFile.close();
+
+ if (pDataArray->m->control_pressed) {}
+ else {
+ pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+ /*****************************************************************************************************/
+
+ pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+ /*****************************************************************************************************/
+ vector<string> duplicateNames(numUniques, "");
+ for(int i=0;i<numSeqs;i++){
+ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
+ }
+
+ ofstream nameFile;
+ pDataArray->m->openOutputFile(namesFileName, nameFile);
+
+ for(int i=0;i<numUniques;i++){
+ if (pDataArray->m->control_pressed) { nameFile.close(); return 0; }
+ nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ }
+ nameFile.close();
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("\nClustering flowgrams...\n");
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ //cluster(listFileName, distFileName, namesFileName);
+ /*****************************************************************************************************/
+ ReadMatrix* read = new ReadColumnMatrix(distFileName);
+ read->setCutoff(pDataArray->cutoff);
+
+ NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+ clusterNameMap->readMap();
+ read->read(clusterNameMap);
+
+ ListVector* list = read->getListVector();
+ SparseMatrix* matrix = read->getMatrix();
+
+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest");
+ string tag = cluster->getTag();
+
+ double clusterCutoff = pDataArray->cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(pDataArray->cutoff));
+
+ ofstream listFileOut;
+ pDataArray->m->openOutputFile(listFileName, listFileOut);
+ list->print(listFileOut);
+ listFileOut.close();
+
+ delete matrix; delete cluster; delete rabund; delete list;
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
+
+ //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+ /*****************************************************************************************************/
+ ifstream listFile;
+ pDataArray->m->openInputFile(listFileName, listFile);
+ string label;
+ int numOTUs;
+
+ listFile >> label >> numOTUs;
+
+ otuData.assign(numSeqs, 0);
+ cumNumSeqs.assign(numOTUs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+ aaP.clear();aaP.resize(numOTUs);
+
+ seqNumber.clear();
+ aaI.clear();
+ seqIndex.clear();
+
+ string singleOTU = "";
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ listFile >> singleOTU;
+
+ istringstream otuString(singleOTU);
+
+ while(otuString){
+
+ string seqName = "";
+
+ for(int j=0;j<singleOTU.length();j++){
+ char letter = otuString.get();
+
+ if(letter != ','){
+ seqName += letter;
+ }
+ else{
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+ int index = nmIt->second;
+
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+ seqName = "";
+ }
+ }
+
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+
+ int index = nmIt->second;
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+
+ otuString.get();
+ }
+
+ sort(aaP[i].begin(), aaP[i].end());
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber.push_back(aaP[i][j]);
+ }
+ for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
+ aaP[i].push_back(0);
+ }
+
+
+ }
+
+ for(int i=1;i<numOTUs;i++){
+ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
+ }
+ aaI = aaP;
+ seqIndex = seqNumber;
+
+ listFile.close();
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurRemove(distFileName);
+ pDataArray->m->mothurRemove(namesFileName);
+ pDataArray->m->mothurRemove(listFileName);
+
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
if (pDataArray->m->control_pressed) { break; }
- cout << "thread i = " << i << endl;
- for(int j=0;j<i;j++){
+
+ double maxDelta = 0;
+ int iter = 0;
+
+ begClock = clock();
+ begTime = time(NULL);
+
+ pDataArray->m->mothurOut("\nDenoising flowgrams...\n");
+ pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+
+ while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){
+
+ if (pDataArray->m->control_pressed) { break; }
- cout << "thread j = " << j << endl;
- float flowDistance = 0.0;
- ////////////////// calcPairwiseDist ///////////////////
- //needed because this is a static global function that can't see the classes internal functions
+ double cycClock = clock();
+ unsigned long long cycTime = time(NULL);
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ int indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
- int minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[i]]];
- if(pDataArray->lengths[pDataArray->mapUniqueToSeq[j]] < minLength){ minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[j]]]; }
+ if (pDataArray->m->control_pressed) { break; }
+
+ //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+ /*****************************************************************************************************/
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = anL[k] * numFlowCells;
+ int flowBValue = nI * numFlowCells;
+
+ double dist = 0;
+
+ for(int l=0;l<lengths[nI];l++){
+ dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist = dist / (double)lengths[nI];
+ /*****************************************************************************************************/
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+ /*****************************************************************************************************/
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
+ }
+
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
+ }
+ maxDelta = maxChange;
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+ /*****************************************************************************************************/
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
+
+ P[nI] += weight[i] * exp(-singleDist * pDataArray->sigma);
+ }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //checkCentroids(numOTUs, centroids, weight);
+ /*****************************************************************************************************/
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
- int ANumFlowCells = pDataArray->mapUniqueToSeq[i] * pDataArray->numFlowCells;
- int BNumFlowCells = pDataArray->mapUniqueToSeq[j] * pDataArray->numFlowCells;
+ //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+ /*****************************************************************************************************/
+ int total = 0;
+ vector<double> newTau(numOTUs,0);
+ vector<double> norms(numSeqs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int indexOffset = i * numOTUs;
+
+ double offset = 1e8;
+
+ for(int j=0;j<numOTUs;j++){
+
+ if(weight[j] > MIN_WEIGHT && change[j] == 1){
+ //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = centroids[j] * numFlowCells;
+ int flowBValue = i * numFlowCells;
+
+ double distTemp = 0;
+
+ for(int l=0;l<lengths[i];l++){
+ distTemp += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist[indexOffset + j] = distTemp / (double)lengths[i];
+ /*****************************************************************************************************/
+
+ }
+
+ if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+ offset = dist[indexOffset + j];
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if(weight[j] > MIN_WEIGHT){
+ newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+ norms[i] += newTau[j];
+ }
+ else{
+ newTau[j] = 0.0;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ newTau[j] /= norms[i];
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if(newTau[j] > MIN_TAU){
+
+ int oldTotal = total;
+
+ total++;
+
+ singleTau.resize(total, 0);
+ seqNumber.resize(total, 0);
+ seqIndex.resize(total, 0);
+
+ singleTau[oldTotal] = newTau[j];
+
+ aaP[j][nSeqsPerOTU[j]] = oldTotal;
+ aaI[j][nSeqsPerOTU[j]] = i;
+ nSeqsPerOTU[j]++;
+ }
+ }
+
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
- for(int k=0;k<minLength;k++){
-
- if (pDataArray->m->control_pressed) { break; }
-
- int flowAIntI = pDataArray->flowDataIntI[ANumFlowCells + k];
- float flowAPrI = pDataArray->flowDataPrI[ANumFlowCells + k];
-
- int flowBIntI = pDataArray->flowDataIntI[BNumFlowCells + k];
- float flowBPrI = pDataArray->flowDataPrI[BNumFlowCells + k];
- flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
- }
+ iter++;
- flowDistance /= (float) minLength;
- //cout << flowDistance << endl;
- ////////////////// end of calcPairwiseDist ///////////////////
-
- if(flowDistance < 1e-6){
- outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
- }
- else if(flowDistance <= pDataArray->cutoff){
- outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << flowDistance << endl;
- }
- }
- if(i % 100 == 0){
- pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
- pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- pDataArray->m->mothurOutEndLine();
- }
+ pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+
+ }
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ pDataArray->m->mothurOut("\nFinalizing...\n");
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ int indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+ /*****************************************************************************************************/
+ vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ int sIndex = seqIndex[index];
+ bigTauMatrix[sIndex * numOTUs + i] = tauValue;
+ }
+ }
+
+ for(int i=0;i<numSeqs;i++){
+ double maxTau = -1.0000;
+ int maxOTU = -1;
+ for(int j=0;j<numOTUs;j++){
+ if(bigTauMatrix[i * numOTUs + j] > maxTau){
+ maxTau = bigTauMatrix[i * numOTUs + j];
+ maxOTU = j;
+ }
+ }
+
+ otuData[i] = maxOTU;
+ }
+
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ int index = otuData[i];
+
+ singleTau[i] = 1.0000;
+ dist[i] = 0.0000;
+
+ aaP[index][nSeqsPerOTU[index]] = i;
+ aaI[index][nSeqsPerOTU[index]] = i;
+
+ nSeqsPerOTU[index]++;
+ }
+
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ vector<int> otuCounts(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+
+ //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+ /*****************************************************************************************************/
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = anL[k] * numFlowCells;
+ int flowBValue = nI * numFlowCells;
+
+ double dist = 0;
+
+ for(int l=0;l<lengths[nI];l++){
+ dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist = dist / (double)lengths[nI];
+ /*****************************************************************************************************/
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ string thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
+
+ ofstream qualityFile;
+ pDataArray->m->openOutputFile(qualityFileName, qualityFile);
+
+ qualityFile.setf(ios::fixed, ios::floatfield);
+ qualityFile.setf(ios::showpoint);
+ qualityFile << setprecision(6);
+
+ vector<vector<int> > qualities(numOTUs);
+ vector<double> pr(HOMOPS, 0);
+
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int index = 0;
+ int base = 0;
+
+ if(nSeqsPerOTU[i] > 0){
+ qualities[i].assign(1024, -1);
+
+ while(index < numFlowCells){
+ double maxPrValue = 1e8;
+ short maxPrIndex = -1;
+ double count = 0.0000;
+
+ pr.assign(HOMOPS, 0);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int lIndex = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[lIndex]];
+ int sequenceIndex = aaI[i][j];
+ short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+ count += tauValue;
+
+ for(int s=0;s<HOMOPS;s++){
+ pr[s] += tauValue * pDataArray->singleLookUp[s * NUMBINS + intensity];
+ }
+ }
+
+ maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+ maxPrValue = pr[maxPrIndex];
+
+ if(count > MIN_COUNT){
+ double U = 0.0000;
+ double norm = 0.0000;
+
+ for(int s=0;s<HOMOPS;s++){
+ norm += exp(-(pr[s] - maxPrValue));
+ }
+
+ for(int s=1;s<=maxPrIndex;s++){
+ int value = 0;
+ double temp = 0.0000;
+
+ U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+ if(U>0.00){
+ temp = log10(U);
+ }
+ else{
+ temp = -10.1;
+ }
+ temp = floor(-10 * temp);
+ value = (int)floor(temp);
+ if(value > 100){ value = 100; }
+
+ qualities[i][base] = (int)value;
+ base++;
+ }
+ }
+
+ index++;
+ }
+ }
+
+
+ if(otuCounts[i] > 0){
+ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+ int j=4; //need to get past the first four bases
+ while(qualities[i][j] != -1){
+ qualityFile << qualities[i][j] << ' ';
+ j++;
+ }
+ qualityFile << endl;
+ }
+ }
+ qualityFile.close();
+ pDataArray->outputNames.push_back(qualityFileName);
+ /*****************************************************************************************************/
+
+ // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
+ ofstream fastaFile;
+ pDataArray->m->openOutputFile(fastaFileName, fastaFile);
+
+ vector<string> names(numOTUs, "");
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int index = centroids[i];
+
+ if(otuCounts[i] > 0){
+ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+ string newSeq = "";
+
+ for(int j=0;j<numFlowCells;j++){
+
+ char base = pDataArray->flowOrder[j % 4];
+ for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+ newSeq += base;
+ }
+ }
+
+ fastaFile << newSeq.substr(4) << endl;
+ }
+ }
+ fastaFile.close();
+
+ pDataArray->outputNames.push_back(fastaFileName);
+
+ if(pDataArray->thisCompositeFASTAFileName != ""){
+ pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
+ }
+
+ /*****************************************************************************************************/
+
+ //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
+ ofstream nameFileOut;
+ pDataArray->m->openOutputFile(nameFileName, nameFileOut);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ if(otuCounts[i] > 0){
+ nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+ for(int j=1;j<nSeqsPerOTU[i];j++){
+ nameFileOut << ',' << seqNameVector[aaI[i][j]];
+ }
+
+ nameFileOut << endl;
+ }
+ }
+ nameFileOut.close();
+ pDataArray->outputNames.push_back(nameFileName);
+
+
+ if(pDataArray->thisCompositeNameFileName != ""){
+ pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
+ }
+ /*****************************************************************************************************/
+
+ //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
+ ofstream otuCountsFile;
+ pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+ string bases = pDataArray->flowOrder;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) {
+ break;
+ }
+ //output the translated version of the centroid sequence for the otu
+ if(otuCounts[i] > 0){
+ int index = centroids[i];
+
+ otuCountsFile << "ideal\t";
+ for(int j=8;j<numFlowCells;j++){
+ char base = bases[j % 4];
+ for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+ otuCountsFile << base;
+ }
+ }
+ otuCountsFile << endl;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int sequence = aaI[i][j];
+ otuCountsFile << seqNameVector[sequence] << '\t';
+
+ string newSeq = "";
+
+ for(int k=0;k<lengths[sequence];k++){
+ char base = bases[k % 4];
+ int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+ for(int s=0;s<freq;s++){
+ newSeq += base;
+ //otuCountsFile << base;
+ }
+ }
+ otuCountsFile << newSeq.substr(4) << endl;
+ }
+ otuCountsFile << endl;
+ }
+ }
+ otuCountsFile.close();
+ pDataArray->outputNames.push_back(otuCountsFileName);
+ /*****************************************************************************************************/
+
+ //writeGroups(flowFileName, numSeqs, seqNameVector);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
+ string groupFileName = fileRoot + "shhh.groups";
+ ofstream groupFile;
+ pDataArray->m->openOutputFile(groupFileName, groupFile);
+
+ for(int i=0;i<numSeqs;i++){
+ if (pDataArray->m->control_pressed) { break; }
+ groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
+ }
+ groupFile.close();
+ pDataArray->outputNames.push_back(groupFileName);
+ /*****************************************************************************************************/
+
+ pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
- ofstream distFile(pDataArray->distFileName.c_str());
- distFile << outStream.str();
- distFile.close();
-
- if (pDataArray->m->control_pressed) {}
- else {
- pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime));
- pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- pDataArray->m->mothurOutEndLine();
- }
+ if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; }
+
+ return 0;
}
catch(exception& e) {
- pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction");
+ pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction");
exit(1);
}
}
#include "mothur.h"
#include "database.hpp"
#include "suffixtree.hpp"
-//class SuffixTree;
class SuffixDB : public Database {
public:
SuffixDB(int);
SuffixDB();
- SuffixDB(const SuffixDB& sdb) : count(sdb.count), Database(sdb) {
- for (int i = 0; i < sdb.suffixForest.size(); i++) {
- SuffixTree temp(sdb.suffixForest[i]);
- suffixForest.push_back(temp);
- }
- }
~SuffixDB();
void generateDB() {}; //adding sequences generates the db
public:
SuffixNode(int, int, int);
- SuffixNode(const SuffixNode& sn) : parentNode(sn.parentNode), startCharPosition(sn.startCharPosition), endCharPosition(sn.endCharPosition) {m = MothurOut::getInstance();}
virtual ~SuffixNode() {}
virtual void print(string, int) = 0;
virtual void setChildren(char, int);
public:
SuffixBranch(int, int, int);
- SuffixBranch(const SuffixBranch& sb) : suffixNode(sb.suffixNode), childNodes(sb.childNodes), SuffixNode(sb.parentNode, sb.startCharPosition, sb.endCharPosition) {}
~SuffixBranch() {}
void print(string, int); // need a special method for printing the node because there are children
void eraseChild(char); // need a special method for erasing the children
return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent
}
-//********************************************************************************************************************
-
-SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode),
- nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) {
- try {
- m = MothurOut::getInstance();
-
- for (int i = 0; i < st.nodeVector.size(); i++) {
- SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i]));
- nodeVector.push_back(temp);
- }
-
-
- }catch(exception& e) {
- m->errorOut(e, "SuffixTree", "SuffixTree");
- exit(1);
- }
-}
-
//********************************************************************************************************************
SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); }	//default constructor: just caches the shared MothurOut instance
public:
SuffixTree();
~SuffixTree();
-// SuffixTree(string, string);
- SuffixTree(const SuffixTree&);
void loadSequence(Sequence);
string getSeqName();
exit(1);
}
}
-/**************************************************************************************************/
+/**************************************************************************************************/
void Tree::randomBlengths() {
try {
for(int i=numNodes-1;i>=0;i--){
#include "needlemanoverlap.hpp"
+/********************************************************************/
+//pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linkers, spacers
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
+	try {
+		m = MothurOut::getInstance();
+		
+		//per-oligo-class mismatch allowances (see ldiffs/sdiffs help text in trim.seqs)
+		pdiffs = p;	//max diffs allowed in the primer
+		bdiffs = b;	//max diffs allowed in the barcode
+		ldiffs = l;	//max diffs allowed in the linker
+		sdiffs = s;	//max diffs allowed in the spacer
+		
+		barcodes = br;	//barcode oligo -> group index
+		primers = pr;	//primer oligo -> group index
+		revPrimer = r;
+		linker = lk;
+		spacer = sp;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "TrimOligos", "TrimOligos");
+		exit(1);
+	}
+}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
Alignment* alignment;
if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
+ map<string,int>::iterator it;
- for(it;it!=barcodes.end();it++){
+ for(it=barcodes.begin();it!=barcodes.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
Alignment* alignment;
if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
+ map<string,int>::iterator it;
- for(it;it!=primers.end();it++){
+ for(it=primers.begin();it!=primers.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
}
}
//*******************************************************************/
-int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
+int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
try {
string rawSequence = seq.getUnaligned();
int success = pdiffs + 1; //guilty until proven innocent
if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
+ if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); }
if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
+ if (!keepForward) { qual.trimQScores(oligo.length(), -1); }
}
success = 0;
break;
Alignment* alignment;
if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
+ map<string,int>::iterator it;
- for(it;it!=primers.end();it++){
+ for(it=primers.begin();it!=primers.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
else{ //use the best match
group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
+ if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); }
if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
+ if (!keepForward) { qual.trimQScores(minPos, -1); }
}
success = minDiff;
}
exit(1);
}
}
+//******************************************************************/
+bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
+	try {
+		string rawSequence = seq.getUnaligned();
+		bool success = 0;	//guilty until proven innocent
+		
+		for(int i=0;i<linker.size();i++){
+			string oligo = linker[i];
+			
+			if(rawSequence.length() < oligo.length()){	//sequence too short to contain the linker
+				success = 0;
+				break;
+			}
+			
+			//the linker sits at the 5' (front) end of the read, ahead of the barcode
+			//(the caller strips linker -> barcode -> spacer -> primer, all from the front),
+			//so match against the start of the sequence and trim it off the front
+			if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+				seq.setUnaligned(rawSequence.substr(oligo.length()));
+				if(qual.getName() != ""){
+					qual.trimQScores(oligo.length(), -1);	//drop the matching leading scores, keep the rest
+				}
+				success = 1;
+				break;
+			}
+		}
+		return success;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "TrimOligos", "stripLinker");
+		exit(1);
+	}
+}
+//******************************************************************/
+bool TrimOligos::stripLinker(Sequence& seq){
+	try {
+		
+		string rawSequence = seq.getUnaligned();
+		bool success = 0;	//guilty until proven innocent
+		
+		for(int i=0;i<linker.size();i++){
+			string oligo = linker[i];
+			
+			if(rawSequence.length() < oligo.length()){	//sequence too short to contain the linker
+				success = 0;
+				break;
+			}
+			
+			//the linker sits at the 5' (front) end of the read, ahead of the barcode,
+			//so match against the start of the sequence and trim it off the front
+			if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+				seq.setUnaligned(rawSequence.substr(oligo.length()));
+				success = 1;
+				break;
+			}
+		}
+		
+		return success;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "TrimOligos", "stripLinker");
+		exit(1);
+	}
+}
+
+//******************************************************************/
+bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
+	try {
+		string rawSequence = seq.getUnaligned();
+		bool success = 0;	//guilty until proven innocent
+		
+		for(int i=0;i<spacer.size();i++){
+			string oligo = spacer[i];
+			
+			if(rawSequence.length() < oligo.length()){	//sequence too short to contain the spacer
+				success = 0;
+				break;
+			}
+			
+			//the spacer sits at the 5' (front) end of the read, between the barcode and
+			//the forward primer (the caller strips it after the barcode and before the
+			//primer), so match against the start of the sequence and trim it off the front
+			if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+				seq.setUnaligned(rawSequence.substr(oligo.length()));
+				if(qual.getName() != ""){
+					qual.trimQScores(oligo.length(), -1);	//drop the matching leading scores, keep the rest
+				}
+				success = 1;
+				break;
+			}
+		}
+		return success;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "TrimOligos", "stripSpacer");
+		exit(1);
+	}
+}
+//******************************************************************/
+bool TrimOligos::stripSpacer(Sequence& seq){
+	try {
+		
+		string rawSequence = seq.getUnaligned();
+		bool success = 0;	//guilty until proven innocent
+		
+		for(int i=0;i<spacer.size();i++){
+			string oligo = spacer[i];
+			
+			if(rawSequence.length() < oligo.length()){	//sequence too short to contain the spacer
+				success = 0;
+				break;
+			}
+			
+			//the spacer sits at the 5' (front) end of the read, between the barcode and
+			//the forward primer, so match against the start of the sequence and trim it
+			//off the front
+			if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+				seq.setUnaligned(rawSequence.substr(oligo.length()));
+				success = 1;
+				break;
+			}
+		}
+		
+		return success;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "TrimOligos", "stripSpacer");
+		exit(1);
+	}
+}
//******************************************************************/
bool TrimOligos::compareDNASeq(string oligo, string seq){
class TrimOligos {
public:
- TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+ TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
~TrimOligos();
int stripBarcode(Sequence&, int&);
int stripBarcode(Sequence&, QualityScores&, int&);
int stripForward(Sequence&, int&);
- int stripForward(Sequence&, QualityScores&, int&);
+ int stripForward(Sequence&, QualityScores&, int&, bool);
bool stripReverse(Sequence&);
bool stripReverse(Sequence&, QualityScores&);
+
+ bool stripLinker(Sequence&);
+ bool stripLinker(Sequence&, QualityScores&);
+
+ bool stripSpacer(Sequence&);
+ bool stripSpacer(Sequence&, QualityScores&);
private:
- int pdiffs, bdiffs;
+ int pdiffs, bdiffs, ldiffs, sdiffs;
map<string, int> barcodes;
map<string, int> primers;
vector<string> revPrimer;
+ vector<string> linker;
+ vector<string> spacer;
MothurOut* m;
CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
- CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+ CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+ CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+ CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
+ CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
- helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
+ helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+ helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+ helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
helpString += "The qfile parameter allows you to provide a quality file.\n";
helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
+ helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
m->mothurConvert(temp, pdiffs);
+
+ temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, ldiffs);
+
+ temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, sdiffs);
- temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
+ temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
m->mothurConvert(temp, tdiffs);
- if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
+ if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
temp = validParameter.validFile(parameters, "qfile", true);
if (temp == "not found") { qFileName = ""; }
temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
allFiles = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
+ keepforward = m->isTrue(temp);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
while (moreSeqs) {
int barcodeIndex = 0;
int primerIndex = 0;
+ if(numLinkers != 0){
+ success = trimOligos.stripLinker(currSeq, currQual);
+ if(!success) { trashCode += 'k'; }
+ }
+
if(barcodes.size() != 0){
success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
+ if(numSpacers != 0){
+ success = trimOligos.stripSpacer(currSeq, currQual);
+ if(!success) { trashCode += 's'; }
+ }
+
if(numFPrimers != 0){
- success = trimOligos.stripForward(currSeq, currQual, primerIndex);
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
barcodes[oligo]=indexBarcode; indexBarcode++;
barcodeNameVector.push_back(group);
+ }else if(type == "LINKER"){
+ linker.push_back(oligo);
+ }else if(type == "SPACER"){
+ spacer.push_back(oligo);
}
else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
}
}
numFPrimers = primers.size();
numRPrimers = revPrimer.size();
+ numLinkers = linker.size();
+ numSpacers = spacer.size();
bool allBlank = true;
for (int i = 0; i < barcodeNameVector.size(); i++) {
bool abort, createGroup;
string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
- bool flip, allFiles, qtrim;
- int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, comboStarts;
+ bool flip, allFiles, qtrim, keepforward;
+ int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
int qWindowSize, qWindowStep, keepFirst, removeLast;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer, outputNames;
map<string, int> barcodes;
vector<string> groupVector;
map<string, int> primers;
+ vector<string> linker;
+ vector<string> spacer;
map<string, int> combos;
map<string, int> groupToIndex;
vector<string> primerNameVector; //needed here?