From: westcott Date: Thu, 14 Jan 2010 16:13:32 +0000 (+0000) Subject: added formatmatrix, formatcolumn, and formatphylip classes. Used these classes in... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=e1cf60b82a48d4d96e3a696a2d221c56cfb0b298 added formatmatrix, formatcolumn, and formatphylip classes. Used these classes in get.oturep to make it able to process distance matrices too large to fit in RAM. added output of .rep.names file to get.oturep command --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 16d9d8e..7f8568e 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -181,6 +181,8 @@ A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; }; + A7D86C7D10FE09AB00865661 /* formatcolumn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */; }; + A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D86C8B10FE09FE00865661 /* formatphylip.cpp */; }; A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; }; A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; }; A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; }; @@ -574,6 +576,11 @@ A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; }; A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; }; A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; }; + A7D86C6A10FE084300865661 /* formatmatrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatmatrix.h; sourceTree = SOURCE_ROOT; }; + A7D86C7B10FE09AB00865661 /* formatcolumn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatcolumn.h; sourceTree = SOURCE_ROOT; }; + A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = SOURCE_ROOT; }; + A7D86C8A10FE09FE00865661 /* formatphylip.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatphylip.h; sourceTree = SOURCE_ROOT; }; + A7D86C8B10FE09FE00865661 /* formatphylip.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatphylip.cpp; sourceTree = SOURCE_ROOT; }; A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = ""; }; A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = ""; }; A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; }; @@ -737,6 +744,11 @@ 3796441D0FB9B9650081FDB6 /* read */ = { isa = PBXGroup; children = ( + A7D86C6A10FE084300865661 /* formatmatrix.h */, + A7D86C7B10FE09AB00865661 /* formatcolumn.h */, + A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */, + A7D86C8A10FE09FE00865661 /* formatphylip.h */, + A7D86C8B10FE09FE00865661 /* formatphylip.cpp */, A727E84810D14568001A8432 /* readblast.h */, A727E84910D14568001A8432 /* readblast.cpp */, A751ACE81098B283003D0911 /* readcluster.h */, @@ -1311,6 +1323,8 @@ A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */, A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */, A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */, + A7D86C7D10FE09AB00865661 /* formatcolumn.cpp in Sources */, + A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/formatcolumn.cpp b/formatcolumn.cpp new file mode 100644 index 0000000..e68d85d --- /dev/null +++ b/formatcolumn.cpp @@ -0,0 +1,173 @@ +/* + * formatcolumn.cpp + * Mothur + * + * Created by westcott on 1/13/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "formatcolumn.h" +#include "progress.hpp" + +/***********************************************************************/ +FormatColumnMatrix::FormatColumnMatrix(string df) : filename(df){ + openInputFile(filename, fileHandle); +} +/***********************************************************************/ + +void FormatColumnMatrix::read(NameAssignment* nameMap){ + try { + + string firstName, secondName; + float distance; + int nseqs = nameMap->size(); + + list = new ListVector(nameMap->getListVector()); + + Progress* reading = new Progress("Formatting matrix: ", nseqs * nseqs); + + int lt = 1; + int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose + int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix + + //need to see if this is a square or a triangular matrix... + + ofstream out; + string tempOutFile = filename + ".temp"; + openOutputFile(tempOutFile, out); + + while(fileHandle && lt == 1){ //let's assume it's a triangular matrix... + + fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff && itA != itB){ + + if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... + refRow = itA->second; + refCol = itB->second; + + //making it square + out << itA->second << '\t' << itB->second << '\t' << distance << endl; + out << itB->second << '\t' << itA->second << '\t' << distance << endl; + } + else if(refRow == itA->second && refCol == itB->second){ lt = 0; } //you are square + else if(refRow == itB->second && refCol == itA->second){ lt = 0; } //you are square + else{ //making it square + out << itA->second << '\t' << itB->second << '\t' << distance << endl; + out << itB->second << '\t' << itA->second << '\t' << distance << endl; + } + + reading->update(itA->second * nseqs / 2); + } + gobble(fileHandle); + } + out.close(); + fileHandle.close(); + + string squareFile; + if(lt == 0){ // oops, it was square + squareFile = filename; + }else{ squareFile = tempOutFile; } + + //sort file by first column so the distances for each row are together + string outfile = getRootName(squareFile) + "sorted.dist.temp"; + + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + string command = "sort -n " + squareFile + " -o " + outfile; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + squareFile + " /O " + outfile; + system(command.c_str()); + #endif + + + //output to new file distance for each row and save positions in file where new row begins + ifstream in; + openInputFile(outfile, in); + + distFile = outfile + ".rowFormatted"; + openOutputFile(distFile, out); + + rowPos.resize(nseqs, -1); + int currentRow; + int first, second; + float dist; + map rowMap; + map::iterator itRow; + + //get first currentRow + in >> first; + currentRow = first; + + string firstString = toString(first); + for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); } + + while(!in.eof()) { + in >> first >> second >> dist; gobble(in); + + if (first != currentRow) { + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + currentRow = first; + rowMap.clear(); + + //save row you just read + rowMap[second] = dist; + + }else{ + rowMap[second] = dist; + } + } + + //print last Row + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + + in.close(); + out.close(); + + + remove(tempOutFile.c_str()); + remove(outfile.c_str()); + + reading->finish(); + list->setLabel("0"); + + } + catch(exception& e) { + errorOut(e, "FormatColumnMatrix", "read"); + exit(1); + } +} + +/***********************************************************************/ +FormatColumnMatrix::~FormatColumnMatrix(){} +/***********************************************************************/ + + + diff --git a/formatcolumn.h b/formatcolumn.h new file mode 100644 index 0000000..c713867 --- /dev/null +++ b/formatcolumn.h @@ -0,0 +1,32 @@ +#ifndef FORMATCOLUMN_H +#define FORMATCOLUMN_H +/* + * formatcolumn.h + * Mothur + * + * Created by westcott on 1/13/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "formatmatrix.h" + +/******************************************************/ + +class FormatColumnMatrix : public FormatMatrix { + +public: + FormatColumnMatrix(string); + ~FormatColumnMatrix(); + void read(NameAssignment*); + +private: + ifstream fileHandle; + string filename; + +}; + +/******************************************************/ + +#endif + diff --git a/formatmatrix.h b/formatmatrix.h new file mode 100644 index 0000000..a02ef6c --- /dev/null +++ b/formatmatrix.h @@ -0,0 +1,43 @@ +#ifndef FORMATMATRIX_H +#define FORMATMATRIX_H + +/* + * formatmatrix.h + * Mothur + * + * Created by westcott on 1/13/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "listvector.hpp" +#include "nameassignment.hpp" + + +//********************************************************************************************************************** + +class FormatMatrix { + +public: + FormatMatrix(){ } + virtual ~FormatMatrix() {} + + virtual void read(NameAssignment*){}; + + void setCutoff(float c) { cutoff = c; } + ListVector* getListVector() { return list; } + string getFormattedFileName() { return distFile; } + vector getRowPositions() { return rowPos; } + +protected: + ListVector* list; + float cutoff; + string distFile; + vector rowPos; +}; + +//********************************************************************************************************************** + +#endif + diff --git a/formatphylip.cpp b/formatphylip.cpp new file mode 100644 index 0000000..4a7878a --- /dev/null +++ b/formatphylip.cpp @@ -0,0 +1,218 @@ +/* + * formatphylip.cpp + * Mothur + * + * Created by westcott on 1/13/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "formatphylip.h" +#include "progress.hpp" + +/***********************************************************************/ +FormatPhylipMatrix::FormatPhylipMatrix(string df) : filename(df) { + openInputFile(filename, fileHandle); +} +/***********************************************************************/ +//not using nameMap +void FormatPhylipMatrix::read(NameAssignment* nameMap){ + try { + + float distance; + int square, nseqs; + string name; + ofstream out; + + fileHandle >> nseqs >> name; + + list = new ListVector(nseqs); + list->set(0, name); + + char d; + while((d=fileHandle.get()) != EOF){ + + if(isalnum(d)){ //you are square + square = 1; + fileHandle.close(); //reset file + + //open and get through numSeqs, code below formats rest of file + openInputFile(filename, fileHandle); + fileHandle >> nseqs; gobble(fileHandle); + + distFile = filename + ".rowFormatted"; + openOutputFile(distFile, out); + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + Progress* reading; + reading = new Progress("Formatting matrix: ", nseqs * nseqs); + + //lower triangle, so must go to column then formatted row file + if(square == 0){ + int index = 0; + + ofstream outTemp; + string tempFile = filename + ".temp"; + openOutputFile(tempFile, outTemp); + + //convert to square column matrix + for(int i=1;i> name; + + list->set(i, name); + + for(int j=0;j> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + outTemp << i << '\t' << j << '\t' << distance << endl; + outTemp << j << '\t' << i << '\t' << distance << endl; + } + index++; + reading->update(index); + } + } + outTemp.close(); + + //format from square column to rowFormatted + //sort file by first column so the distances for each row are together + string outfile = getRootName(tempFile) + "sorted.dist.temp"; + + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + string command = "sort -n " + tempFile + " -o " + outfile; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + tempFile + " /O " + outfile; + system(command.c_str()); + #endif + + + //output to new file distance for each row and save positions in file where new row begins + ifstream in; + openInputFile(outfile, in); + + distFile = outfile + ".rowFormatted"; + openOutputFile(distFile, out); + + rowPos.resize(nseqs, -1); + int currentRow; + int first, second; + float dist; + map rowMap; + map::iterator itRow; + + //get first currentRow + in >> first; + currentRow = first; + + string firstString = toString(first); + for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); } + + while(!in.eof()) { + in >> first >> second >> dist; gobble(in); + + if (first != currentRow) { + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + currentRow = first; + rowMap.clear(); + + //save row you just read + rowMap[second] = dist; + + index++; + reading->update(index); + }else{ + rowMap[second] = dist; + } + } + + //print last Row + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + in.close(); + out.close(); + + remove(tempFile.c_str()); + remove(outfile.c_str()); + } + else{ //square matrix convert directly to formatted row file + int index = nseqs; + map rowMap; + map::iterator itRow; + + + for(int i=0;i> name; + + list->set(i, name); + + for(int j=0;j> distance; + + if (distance == -1) { distance = 1000000; } + + if((distance < cutoff) && (j != i)){ + rowMap[j] = distance; + } + index++; + reading->update(index); + } + + //save position in file of each new row + rowPos[i] = out.tellp(); + + //output row to file + out << i << '\t' << rowMap.size() << '\t'; + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + //clear map for new row's info + rowMap.clear(); + } + } + reading->finish(); + delete reading; + + list->setLabel("0"); + fileHandle.close(); + out.close(); + + } + catch(exception& e) { + errorOut(e, "FormatPhylipMatrix", "read"); + exit(1); + } +} +/***********************************************************************/ +FormatPhylipMatrix::~FormatPhylipMatrix(){} +/***********************************************************************/ + + diff --git a/formatphylip.h b/formatphylip.h new file mode 100644 index 0000000..9ca14bb --- /dev/null +++ b/formatphylip.h @@ -0,0 +1,31 @@ +#ifndef FORMATPHYLIP_H +#define FORMATPHYLIP_H + +/* + * formatphylip.h + * Mothur + * + * Created by westcott on 1/13/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "formatmatrix.h" + +/******************************************************/ + +class FormatPhylipMatrix : public FormatMatrix { + +public: + FormatPhylipMatrix(string); + ~FormatPhylipMatrix(); + void read(NameAssignment*); +private: + ifstream fileHandle; + string filename; +}; + +/******************************************************/ + +#endif + diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 41cc6b5..9667755 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -8,6 +8,12 @@ */ #include "getoturepcommand.h" +#include "readphylip.h" +#include "readcolumn.h" +#include "formatphylip.h" +#include "formatcolumn.h" + + //******************************************************************************************************************** //sorts lowest to highest @@ -42,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option){ help(); abort = true; } else { //valid paramters for this command - string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column"}; + string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -55,12 +61,6 @@ GetOTURepCommand::GetOTURepCommand(string option){ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } - //make sure the user has already run the read.otu command - if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) { - mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); - abort = true; - } - //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; } @@ -73,10 +73,16 @@ GetOTURepCommand::GetOTURepCommand(string option){ phylipfile = validParameter.validFile(parameters, "phylip", true); if (phylipfile == "not found") { phylipfile = ""; } else if (phylipfile == "not open") { abort = true; } + else { distFile = phylipfile; format = "phylip"; } columnfile = validParameter.validFile(parameters, "column", true); if (columnfile == "not found") { columnfile = ""; } else if (columnfile == "not open") { abort = true; } + else { distFile = columnfile; format = "column"; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; } else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; } @@ -86,22 +92,12 @@ GetOTURepCommand::GetOTURepCommand(string option){ //check for optional parameter and set defaults // ...at some point should added some additional type checking... label = validParameter.validFile(parameters, "label", false); - if (label == "not found") { label = ""; } + if (label == "not found") { label = ""; allLines = 1; } else { if(label != "all") { splitAtDash(label, labels); allLines = 0; } else { allLines = 1; } } - //if the user has not specified any labels use the ones from read.otu - if (label == "") { - allLines = globaldata->allLines; - labels = globaldata->labels; - } - - namesfile = validParameter.validFile(parameters, "name", true); - if (namesfile == "not open") { abort = true; } - else if (namesfile == "not found") { namesfile = ""; } - groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } @@ -122,27 +118,15 @@ GetOTURepCommand::GetOTURepCommand(string option){ sorted = ""; } - if (abort == false) { - - //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix - if (globaldata->gListVector != NULL) { - vector names; - string binnames; - //map names to rows in sparsematrix - for (int i = 0; i < globaldata->gListVector->size(); i++) { - names.clear(); - binnames = globaldata->gListVector->get(i); - - splitAtComma(binnames, names); - - for (int j = 0; j < names.size(); j++) { - nameToIndex[names[j]] = i; - } - } - } else { mothurOut("error, no listvector."); mothurOutEndLine(); } - - fasta = new FastaMap(); - } + string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; } + large = isTrue(temp); + + temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } + convert(temp, precision); + + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; } + convert(temp, cutoff); + cutoff += (5 / (precision * 10.0)); } } catch(exception& e) { @@ -156,13 +140,14 @@ GetOTURepCommand::GetOTURepCommand(string option){ void GetOTURepCommand::help(){ try { mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n"); - mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label. The fasta and list parameters are required.\n"); + mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label. The fasta and list parameters are required.\n"); mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"); mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"); mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n"); mothurOut("The default value for label is all labels in your inputfile.\n"); mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n"); - mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n"); + mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n"); + mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n"); mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n"); mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); } @@ -174,16 +159,7 @@ void GetOTURepCommand::help(){ //********************************************************************************************************************** -GetOTURepCommand::~GetOTURepCommand(){ - if (abort == false) { - delete input; globaldata->ginput = NULL; - delete read; - delete fasta; - if (groupfile != "") { - delete groupMap; globaldata->gGroupmap = NULL; - } - } -} +GetOTURepCommand::~GetOTURepCommand(){} //********************************************************************************************************************** @@ -193,15 +169,93 @@ int GetOTURepCommand::execute(){ if (abort == true) { return 0; } int error; - //read fastafile - fasta->readFastaFile(fastafile); + if (!large) { + //read distance files + if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); } + else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); } + else { mothurOut("File format error."); mothurOutEndLine(); return 0; } + + readMatrix->setCutoff(cutoff); + + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + }else{ nameMap = NULL; } + + readMatrix->read(nameMap); + + //get matrix + if (globaldata->gListVector != NULL) { delete globaldata->gListVector; } + globaldata->gListVector = readMatrix->getListVector(); + + SparseMatrix* matrix = readMatrix->getMatrix(); + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + seqVec = vector(globaldata->gListVector->size()); + for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) { + seqVec[currentCell->row][currentCell->column] = currentCell->dist; + } + + delete matrix; + delete readMatrix; + }else { + //process file and set up indexes + if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } + else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); } + else { mothurOut("File format error."); mothurOutEndLine(); return 0; } + + formatMatrix->setCutoff(cutoff); + + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + }else{ nameMap = NULL; } + + formatMatrix->read(nameMap); + + //get matrix + if (globaldata->gListVector != NULL) { delete globaldata->gListVector; } + globaldata->gListVector = formatMatrix->getListVector(); + + distFile = formatMatrix->getFormattedFileName(); + + //positions in file where the distances for each sequence begin + //rowPositions[1] = position in file where distance related to sequence 1 start. + rowPositions = formatMatrix->getRowPositions(); + + delete formatMatrix; + + //openfile for getMap to use + openInputFile(distFile, inRow); + } - //in.close(); - //read distance file and generate indexed distance file that can be used by findrep function -//....new reading class....// + //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix + if (globaldata->gListVector != NULL) { + vector names; + string binnames; + //map names to rows in sparsematrix + for (int i = 0; i < globaldata->gListVector->size(); i++) { + names.clear(); + binnames = globaldata->gListVector->get(i); + + splitAtComma(binnames, names); + + for (int j = 0; j < names.size(); j++) { + nameToIndex[names[j]] = i; + } + } + } else { mothurOut("error, no listvector."); mothurOutEndLine(); } + fasta = new FastaMap(); + + //read fastafile + fasta->readFastaFile(fastafile); + //if user gave a namesfile then use it - if (namesfile != "") { readNamesFile(); } + if (namefile != "") { readNamesFile(); } //set format to list so input can get listvector globaldata->setFormat("list"); @@ -217,7 +271,7 @@ int GetOTURepCommand::execute(){ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (allLines == 1 || labels.count(list->getLabel()) == 1){ @@ -273,9 +327,18 @@ int GetOTURepCommand::execute(){ if (error == 1) { return 0; } //there is an error in hte input files, abort command } - delete list; + //close and remove formatted matrix file + inRow.close(); + remove(distFile.c_str()); + globaldata->gListVector = NULL; - + delete input; globaldata->ginput = NULL; + delete read; + delete fasta; + if (groupfile != "") { + delete groupMap; globaldata->gGroupmap = NULL; + } + return 0; } catch(exception& e) { @@ -288,7 +351,7 @@ int GetOTURepCommand::execute(){ void GetOTURepCommand::readNamesFile() { try { vector dupNames; - openInputFile(namesfile, inNames); + openInputFile(namefile, inNames); string name, names, sequence; @@ -354,37 +417,58 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i // the first sequence of the OTU is the representative one if ((names.size() == 1) || (list->getLabel() == "unique")) { return names[0]; - } else { + }else{ vector seqIndex(names.size()); vector max_dist(names.size()); + vector total_dist(names.size()); //fill seqIndex and initialize sums for (size_t i = 0; i < names.size(); i++) { seqIndex[i] = nameToIndex[names[i]]; max_dist[i] = 0.0; + total_dist[i] = 0.0; } // loop through all entries in seqIndex SeqMap::iterator it; SeqMap currMap; for (size_t i=0; i < seqIndex.size(); i++) { - //currMap = seqVec[seqIndex[i]]; + + if (!large) { currMap = seqVec[seqIndex[i]]; } + else { currMap = getMap(seqIndex[i]); } + for (size_t j=0; j < seqIndex.size(); j++) { it = currMap.find(seqIndex[j]); if (it != currMap.end()) { max_dist[i] = max(max_dist[i], it->second); max_dist[j] = max(max_dist[j], it->second); + total_dist[i] += it->second; + total_dist[j] += it->second; + }else{ //if you can't find the distance make it the cutoff + max_dist[i] = max(max_dist[i], cutoff); + max_dist[j] = max(max_dist[j], cutoff); + total_dist[i] += cutoff; + total_dist[j] += cutoff; } } } // sequence with the smallest maximum distance is the representative + //if tie occurs pick sequence with smallest average distance float min = 10000; int minIndex; for (size_t i=0; i < max_dist.size(); i++) { if (max_dist[i] < min) { min = max_dist[i]; minIndex = i; + }else if (max_dist[i] == min) { + float currentAverage = total_dist[minIndex] / (float) total_dist.size(); + float newAverage = total_dist[i] / (float) total_dist.size(); + + if (newAverage < currentAverage) { + min = max_dist[i]; + minIndex = i; + } } } @@ -406,12 +490,19 @@ int GetOTURepCommand::process(ListVector* processList) { string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta"; openOutputFile(outputFileName, out); vector reps; - + + ofstream newNamesOutput; + string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names"; + openOutputFile(outputNamesFile, newNamesOutput); + //for each bin in the list vector for (int i = 0; i < processList->size(); i++) { string groups; int binsize; nameRep = findRep(i, groups, processList, binsize); + + //output to new names file + newNamesOutput << nameRep << '\t' << processList->get(i) << endl; //print out name and sequence for that bin sequence = fasta->getSequence(nameRep); @@ -432,6 +523,7 @@ int GetOTURepCommand::process(ListVector* processList) { }else { mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); remove(outputFileName.c_str()); + remove(outputNamesFile.c_str()); return 1; } } @@ -456,6 +548,7 @@ int GetOTURepCommand::process(ListVector* processList) { } out.close(); + newNamesOutput.close(); return 0; } @@ -466,3 +559,33 @@ int GetOTURepCommand::process(ListVector* processList) { } //********************************************************************************************************************** +SeqMap GetOTURepCommand::getMap(int row) { + try { + SeqMap rowMap; + + //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff + if (rowPositions[row] != -1){ + //go to row in file + inRow.seekg(rowPositions[row]); + + int rowNum, numDists, colNum; + float dist; + + inRow >> rowNum >> numDists; + + for(int i = 0; i < numDists; i++) { + inRow >> colNum >> dist; + rowMap[colNum] = dist; + + } + } + + return rowMap; + } + catch(exception& e) { + errorOut(e, "GetOTURepCommand", "getMap"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/getoturepcommand.h b/getoturepcommand.h index 58fec8d..0e8a121 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -18,6 +18,8 @@ #include "readotu.h" #include "fastamap.h" #include "groupmap.h" +#include "readmatrix.hpp" +#include "formatmatrix.h" typedef list::iterator MatData; typedef map SeqMap; @@ -48,15 +50,24 @@ private: InputData* input; FastaMap* fasta; GroupMap* groupMap; - string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile; + ReadMatrix* readMatrix; + FormatMatrix* formatMatrix; + NameAssignment* nameMap; + string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format; ofstream out; - ifstream in, inNames; - bool abort, allLines, groupError; + ifstream in, inNames, inRow; + bool abort, allLines, groupError, large; set labels; //holds labels to be used map nameToIndex; //maps sequence name to index in sparsematrix + float cutoff; + int precision; + vector seqVec; // contains maps with sequence index and distance + // for all distances related to a certain sequence + vector rowPositions; void readNamesFile(); int process(ListVector*); + SeqMap getMap(int); string findRep(int, string&, ListVector*, int&); // returns the name of the "representative" sequence of given bin, // fills a string containing the groups in that bin if a groupfile is given, // and returns the number of sequences in the given bin diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 8081f10..f224b47 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -357,7 +357,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { } outFasta << seqs[k].getAligned() << endl; - } + }else { mothurOut(seqs[k].getName() + " is not in your fasta file. Please correct."); mothurOutEndLine(); } } outFasta.close(); diff --git a/readdistcommand.cpp b/readdistcommand.cpp index 7e81d7f..1c134f5 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -206,7 +206,7 @@ int ReadDistCommand::execute(){ if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; } globaldata->gSparseMatrix = read->getMatrix(); numDists = globaldata->gSparseMatrix->getNNodes(); - cout << "matrix contains " << numDists << " distances." << endl; + //cout << "matrix contains " << numDists << " distances." << endl; int lines = cutoff / (1.0/precision); vector dist_cutoff(lines+1,0);