A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
A70DECD91063D8B40057C03C /* secondarystructurecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */; };
+ A727E84A10D14568001A8432 /* readblast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727E84910D14568001A8432 /* readblast.cpp */; };
A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; };
A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACCF10848E6100139801 /* hclustercommand.cpp */; };
A729ACE410849BBE00139801 /* hcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACE310849BBE00139801 /* hcluster.cpp */; };
A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
+ A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
A7F5759710CEBC0600E20E47 /* libreadline.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A7F5759610CEBC0600E20E47 /* libreadline.a */; };
A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
A70DECD71063D8B40057C03C /* secondarystructurecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = secondarystructurecommand.h; sourceTree = SOURCE_ROOT; };
A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = secondarystructurecommand.cpp; sourceTree = SOURCE_ROOT; };
+ A727E84810D14568001A8432 /* readblast.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readblast.h; sourceTree = SOURCE_ROOT; };
+ A727E84910D14568001A8432 /* readblast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readblast.cpp; sourceTree = SOURCE_ROOT; };
A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = SOURCE_ROOT; };
A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = SOURCE_ROOT; };
A729ACCE10848E6100139801 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = SOURCE_ROOT; };
A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
+ A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
+ A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; };
A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; };
3796441D0FB9B9650081FDB6 /* read */ = {
isa = PBXGroup;
children = (
+ A727E84810D14568001A8432 /* readblast.h */,
+ A727E84910D14568001A8432 /* readblast.cpp */,
A751ACE81098B283003D0911 /* readcluster.h */,
A751ACE91098B283003D0911 /* readcluster.cpp */,
375AA1340F9E433D008EF9B8 /* readcolumn.h */,
21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */,
7E5A17AD0FE57292003C6A03 /* mergefilecommand.h */,
7E5A17AE0FE57292003C6A03 /* mergefilecommand.cpp */,
+ A7E0AAB410D2886D00CF407B /* mgclustercommand.h */,
+ A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */,
375873F70F7D649C0040F377 /* nocommands.h */,
375873F60F7D649C0040F377 /* nocommands.cpp */,
3792946E0F2E191800B9034A /* parsimonycommand.h */,
A787844510A1EBDD0086103D /* knn.cpp in Sources */,
A773805F10B6D6FC002A6A61 /* taxonomyequalizer.cpp in Sources */,
A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */,
+ A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
+ A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
while(!inFASTA.eof()){
input = getline(inFASTA);
if (input.length() != 0) {
- if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
}
}
inFASTA.close();
numFastaSeqs = positions.size();
-
+
int numSeqsPerProcessor = numFastaSeqs / processors;
for (int i = 0; i < processors; i++) {
- int startPos = positions[ i * numSeqsPerProcessor ];
+ long int startPos = positions[ i * numSeqsPerProcessor ];
if(i == processors - 1){
numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
}
lines.push_back(new linePair(startPos, numSeqsPerProcessor));
}
+
createProcesses(alignFileName, reportFileName, accnosFileName);
rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
for(int i=0;i<line->numSeqs;i++){
- Sequence* candidateSeq = new Sequence(inFASTA);
+ Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
int origNumBases = candidateSeq->getNumBases();
string originalUnaligned = candidateSeq->getUnaligned();
int numBasesNeeded = origNumBases * threshold;
-
+
if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
alignment->resize(candidateSeq->getUnaligned().length()+1);
struct linePair {
int start;
int numSeqs;
- linePair(int i, int j) : start(i), numSeqs(j) {}
+ linePair(long int i, int j) : start(i), numSeqs(j) {}
};
vector<int> processIDS; //processid
vector<linePair*> lines;
if (temp.getName() != "") {
templateSequences.push_back(temp);
//save longest base
- if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); }
+ if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length()+1; }
}
}
seqVec[currentCell->row].push_back(currentCell);
seqVec[currentCell->column].push_back(currentCell);
}
+ mapWanted = false; //set to true by mgcluster to speed up overlap merge
}
/***********************************************************************/
}
}
-
+/***********************************************************************/
// Remove the specified cell from the seqVec and from the sparse
// matrix
void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
void Cluster::clusterNames(){
try {
// cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
-
+ if (mapWanted) { updateMap(); }
+
list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
list->set(smallRow, "");
list->setLabel(toString(smallDist));
exit(1);
}
}
+/***********************************************************************/
+void Cluster::setMapWanted(bool m) {
+ try {
+ mapWanted = m;
+
+ //initialize map
+ for (int i = 0; i < list->getNumBins(); i++) {
+
+ //parse bin
+ string names = list->get(i);
+ while (names.find_first_of(',') != -1) {
+ //get name from bin
+ string name = names.substr(0,names.find_first_of(','));
+ //save name and bin number
+ seq2Bin[name] = i;
+ names = names.substr(names.find_first_of(',')+1, names.length());
+ }
+
+ //get last name
+ seq2Bin[names] = i;
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Cluster", "setMapWanted");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void Cluster::updateMap() {
+try {
+ //update location of seqs in smallRow since they move to smallCol now
+ string names = list->get(smallRow);
+ while (names.find_first_of(',') != -1) {
+ //get name from bin
+ string name = names.substr(0,names.find_first_of(','));
+ //save name and bin number
+ seq2Bin[name] = smallCol;
+ names = names.substr(names.find_first_of(',')+1, names.length());
+ }
+
+ //get last name
+ seq2Bin[names] = smallCol;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Cluster", "updateMap");
+ exit(1);
+ }
+}
+/***********************************************************************/
-/***********************************************************************/
public:
Cluster(RAbundVector*, ListVector*, SparseMatrix*);
- virtual void update();
+ virtual void update();
virtual string getTag() = 0;
+ virtual void setMapWanted(bool m);
+ virtual map<string, int> getSeqtoBin() { return seq2Bin; }
protected:
void getRowColCells();
virtual void clusterBins();
virtual void clusterNames();
+ virtual void updateMap();
RAbundVector* rabund;
ListVector* list;
int smallRow;
int smallCol;
float smallDist;
+ bool mapWanted;
+ map<string, int> seq2Bin;
- vector<MatVec> seqVec; // contains vectors of cells related to a certain sequence\r
+ vector<MatVec> seqVec; // contains vectors of cells related to a certain sequence
MatVec rowCells;
MatVec colCells;
ull nRowCells;
#include "hclustercommand.h"
#include "classifyseqscommand.h"
#include "phylotypecommand.h"
+#include "mgclustercommand.h"
/*******************************************************/
commands["hcluster"] = "hcluster";
commands["classify.seqs"] = "classify.seqs";
commands["phylotype"] = "phylotype";
+ commands["mgcluster"] = "mgcluster";
}
/***********************************************************/
else if(commandName == "hcluster") { command = new HClusterCommand(optionString); }
else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); }
else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); }
+ else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
#include "engine.hpp"
-/***********************************************************************/
-inline void terminateCommand(int dummy) {
-
- //mothurOut("Stopping command....");
- //CommandFactory* cFactory = CommandFactory::getInstance();
- //cFactory->getCommand(); //deletes old command and makes new no command.
- //this may cause memory leak if old commands execute function allocated memory
- //that is freed in the execute function and not the deconstructor
- //mothurOut("DONE."); mothurOutEndLine();
-}
/***********************************************************************/
Engine::Engine(){
try {
virtual bool getInput() = 0;
virtual string getCommand();
vector<string> getOptions() { return options; }
- //virtual void terminateCommand(int);
protected:
vector<string> options;
CommandFactory* cFactory;
int getNumGroups();
void printMatrix(ostream&);
float get(int, int);
+ Names getRowInfo(int row) { return index[row]; }
private:
vector< vector<float> > matrix; //a 2D distance matrix of all the sequences and their distances to eachother.
globaldata = GlobalData::getInstance();
abort = false;
+ unique = true;
allLines = 1;
labels.clear();
else {
//valid paramters for this command
- string Array[] = {"label","groups","fasta","list","group","output"};
+ string Array[] = {"label","unique","shared","fasta","list","group","output"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
output = validParameter.validFile(parameters, "output", false);
if (output == "not found") { output = ""; }
- groups = validParameter.validFile(parameters, "groups", false);
+ groups = validParameter.validFile(parameters, "unique", false);
if (groups == "not found") { groups = ""; }
else {
splitAtDash(groups, Groups);
globaldata->Groups = Groups;
+
+ }
+
+ groups = validParameter.validFile(parameters, "shared", false);
+ if (groups == "not found") { groups = ""; }
+ else {
+ splitAtDash(groups, Groups);
+ globaldata->Groups = Groups;
+ unique = false;
}
fastafile = validParameter.validFile(parameters, "fasta", true);
void GetSharedOTUCommand::help(){
try {
- mothurOut("The get.sharedseqs command parameters are list, group, label, groups, output and fasta. The list and group parameters are required.\n");
+ mothurOut("The get.sharedseqs command parameters are list, group, label, unique, shared, output and fasta. The list and group parameters are required.\n");
mothurOut("The label parameter allows you to select what distance levels you would like output files for, and are separated by dashes.\n");
- mothurOut("The groups parameter allows you to select groups you would like to know the shared info for, and are separated by dashes.\n");
+ mothurOut("The unique and shared parameters allow you to select groups you would like to know the shared info for, and are separated by dashes.\n");
+ mothurOut("If you enter your groups under the unique parameter mothur will return the otus that contain ONLY sequences from those groups.\n");
+ mothurOut("If you enter your groups under the shared parameter mothur will return the otus that contain sequences from those groups and may also contain sequences from other groups.\n");
+ mothurOut("If you do not enter any groups then the get.sharedseqs command will return sequences that are unique to all groups in your group file.\n");
mothurOut("The fasta parameter allows you to input a fasta file and outputs a fasta file for each distance level containing only the sequences that are in OTUs shared by the groups specified.\n");
mothurOut("The output parameter allows you to output the list of names without the group and bin number added. \n");
mothurOut("With this option you can use the names file as an input in get.seqs and remove.seqs commands. To do this enter output=accnos. \n");
if (Groups.size() == 0) {
Groups = groupMap->namesOfGroups;
}
-
+
//put groups in map to find easier
for(int i = 0; i < Groups.size(); i++) {
groupFinder[Groups[i]] = Groups[i];
//go through each bin, find out if shared
for (int i = 0; i < shared->getNumBins(); i++) {
- bool sharedByAll = true;
+ bool uniqueOTU = true;
map<string, int> atLeastOne;
for (int m = 0; m < Groups.size(); m++) {
vector<string> namesOfSeqsInThisBin;
string names = shared->get(i);
- while ((names.find_first_of(',') != -1) && sharedByAll) {
+ while ((names.find_first_of(',') != -1)) {
string name = names.substr(0,names.find_first_of(','));
names = names.substr(names.find_first_of(',')+1, names.length());
-
+
//find group
string seqGroup = groupMap->getGroup(name);
if (output != "accnos") {
namesOfSeqsInThisBin.push_back((name + "\t" + seqGroup + "\t" + toString(i+1)));
}else { namesOfSeqsInThisBin.push_back(name); }
-
+
if (seqGroup == "not found") { mothurOut(name + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); }
//is this seq in one of hte groups we care about
it = groupFinder.find(seqGroup);
- if (it == groupFinder.end()) { sharedByAll = false; } //you have a sequence from a group you don't want
+ if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want
else { atLeastOne[seqGroup]++; }
}
//get last name
- //find group
- if (sharedByAll) {
- string seqGroup = groupMap->getGroup(names);
- if (output != "accnos") {
- namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1)));
- }else { namesOfSeqsInThisBin.push_back(names); }
-
- if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); }
+ string seqGroup = groupMap->getGroup(names);
+ if (output != "accnos") {
+ namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1)));
+ }else { namesOfSeqsInThisBin.push_back(names); }
+
+ if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); }
+
+ //is this seq in one of hte groups we care about
+ it = groupFinder.find(seqGroup);
+ if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want
+ else { atLeastOne[seqGroup]++; }
- //is this seq in one of hte groups we care about
- it = groupFinder.find(seqGroup);
- if (it == groupFinder.end()) { sharedByAll = false; } //you have a sequence from a group you don't want
- else { atLeastOne[seqGroup]++; }
- }
//make sure you have at least one seq from each group you want
+ bool sharedByAll = true;
map<string, int>::iterator it2;
for (it2 = atLeastOne.begin(); it2 != atLeastOne.end(); it2++) {
if (it2->second == 0) { sharedByAll = false; }
}
- //if shared, save names of seqs in that bin
- if (sharedByAll) {
+ //if the user wants unique bins and this is unique then print
+ //or this the user wants shared bins and this bin is shared then print
+ if ((unique && uniqueOTU && sharedByAll) || (!unique && sharedByAll)) {
wroteSomething = true;
num++;
}
}
}
-
-
-
}
outNames.close();
set<string> labels;
string fastafile, label, groups, listfile, groupfile, output;
- bool abort, allLines;
+ bool abort, allLines, unique;
vector<string> Groups;
map<string, string> groupFinder;
map<string, string>::iterator it;
HCluster::HCluster(RAbundVector* rav, ListVector* lv) : rabund(rav), list(lv){
try {
-
+ mapWanted = false;
numSeqs = list->getNumSeqs();
//initialize cluster array
void HCluster::clusterNames(){
try {
///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
-
+ if (mapWanted) { updateMap(); }
+
list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
list->set(clusterArray[smallRow].smallChild, "");
list->setLabel(toString(smallDist));
}
}
/***********************************************************************/
-bool HCluster::update(int row, int col, float distance){
+void HCluster::update(int row, int col, float distance){
try {
smallRow = row;
}
//printInfo();
- return clustered;
}
catch(exception& e) {
errorOut(e, "HCluster", "update");
exit(1);
}
}
-
-
/***********************************************************************/
+void HCluster::setMapWanted(bool m) {
+ try {
+ mapWanted = m;
+
+ //initialize map
+ for (int i = 0; i < list->getNumBins(); i++) {
+
+ //parse bin
+ string names = list->get(i);
+ while (names.find_first_of(',') != -1) {
+ //get name from bin
+ string name = names.substr(0,names.find_first_of(','));
+ //save name and bin number
+ seq2Bin[name] = i;
+ names = names.substr(names.find_first_of(',')+1, names.length());
+ }
+
+ //get last name
+ seq2Bin[names] = i;
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "setMapWanted");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void HCluster::updateMap() {
+try {
+ //update location of seqs in smallRow since they move to smallCol now
+ string names = list->get(clusterArray[smallRow].smallChild);
+ while (names.find_first_of(',') != -1) {
+ //get name from bin
+ string name = names.substr(0,names.find_first_of(','));
+ //save name and bin number
+ seq2Bin[name] = clusterArray[smallCol].smallChild;
+ names = names.substr(names.find_first_of(',')+1, names.length());
+ }
+
+ //get last name
+ seq2Bin[names] = clusterArray[smallCol].smallChild;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "updateMap");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
public:
HCluster(RAbundVector*, ListVector*);
~HCluster(){};
- bool update(int, int, float);
- //string getTag();
+ void update(int, int, float);
+ void setMapWanted(bool m);
+ map<string, int> getSeqtoBin() { return seq2Bin; }
protected:
void clusterBins();
int makeActive();
void printInfo();
void updateArrayandLinkTable();
+ void updateMap();
RAbundVector* rabund;
ListVector* list;
map<int, int>::iterator it2;
int numSeqs;
-
int smallRow;
int smallCol;
float smallDist;
+ map<string, int> seq2Bin;
+ bool mapWanted;
};
print_start = true;
start = time(NULL);
-
ifstream in;
openInputFile(distfile, in);
float distance;
cluster = new HCluster(rabund, list);
- bool clusteredSomething;
vector<seqDist> seqs; seqs.resize(1); // to start loop
exitedBreak = false; //lets you know if there is a distance stored in next
//cout << "before cluster update" << endl;
if (seqs[i].seq1 != seqs[i].seq2) {
- clusteredSomething = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+ cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
float rndDist = roundDist(seqs[i].dist, precision);
//cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;
}
//**********************************************************************************************************************
-
vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
try {
string firstName, secondName;
//University of Florida, Gainesville, FL 32610-3622 and 3Materials Technology Directorate, Air Force Technical
//Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA
//Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009
-
-/******************************************************************/
-
-
/************************************************************/
struct seqDist {
int seq1;
vector<double> Libshuff::getMinX(int x){
try{
-
vector<double> minX(groupSizes[x], 0);
for(int i=0;i<groupSizes[x];i++){
minX[i] = (groupSizes[x] > 1 ? (i==0 ? matrix->get(groups[x][0], groups[x][1]) : matrix->get(groups[x][i], groups[x][0])) : 0.0); //get the first value in row i of this block
LibShuffCommand::LibShuffCommand(string option){
try {
-
globaldata = GlobalData::getInstance();
abort = false;
Groups.clear();
userform = validParameter.validFile(parameters, "form", false); if (userform == "not found") { userform = "integral"; }
if (abort == false) {
-
+
matrix = globaldata->gMatrix; //get the distance matrix
setGroups(); //set the groups to be analyzed and sorts them
-
+
/********************************************************************************************/
//this is needed because when we read the matrix we sort it into groups in alphabetical order
//the rest of the command and the classes used in this command assume specific order
/********************************************************************************************/
matrix->setGroups(globaldata->gGroupmap->namesOfGroups);
vector<int> sizes;
- for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) { sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i])); }
+ for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) { sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i])); }
matrix->setSizes(sizes);
-
+
if(userform == "discrete"){
form = new DLibshuff(matrix, iters, step, cutOff);
try {
if (abort == true) { return 0; }
-
+
savedDXYValues = form->evaluateAll();
savedMinValues = form->getSavedMins();
-
+
pValueCounts.resize(numGroups);
for(int i=0;i<numGroups;i++){
pValueCounts[i].assign(numGroups, 0);
--- /dev/null
+/*
+ * mgclustercommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 12/11/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mgclustercommand.h"
+#include "readcluster.h"
+
+//**********************************************************************************************************************
+MGClusterCommand::MGClusterCommand(string option){
+ try {
+ globaldata = GlobalData::getInstance();
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+ //valid paramters for this command
+ string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //check for required parameters
+ blastfile = validParameter.validFile(parameters, "blast", true);
+ if (blastfile == "not open") { abort = true; }
+ else if (blastfile == "not found") { blastfile = ""; }
+
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not open") { abort = true; }
+ else if (namefile == "not found") { namefile = ""; }
+
+ if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
+
+ //check for optional parameter and set defaults
+ string temp;
+ temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
+ precisionLength = temp.length();
+ convert(temp, precision);
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
+ convert(temp, cutoff);
+ cutoff += (5 / (precision * 10.0));
+
+ method = validParameter.validFile(parameters, "method", false);
+ if (method == "not found") { method = "furthest"; }
+
+ if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
+ else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
+
+ temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
+ convert(temp, length);
+
+ temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
+ convert(temp, penalty);
+
+ temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
+ minWanted = isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
+ merge = isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
+ hclusterWanted = isTrue(temp);
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "MGClusterCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+void MGClusterCommand::help(){
+ try {
+ mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
+ mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
+ mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
+ mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
+ mothurOut("The precision parameter's default value is 100. \n");
+ mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
+ mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
+ mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
+ mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
+ mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
+ mothurOut("The mgcluster command should be in the following format: \n");
+ mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
+ mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "help");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+MGClusterCommand::~MGClusterCommand(){}
+//**********************************************************************************************************************
+int MGClusterCommand::execute(){
+ try {
+
+ if (abort == true) { return 0; }
+
+ //read names file
+ if (namefile != "") {
+ nameMap = new NameAssignment(namefile);
+ nameMap->readMap();
+ }else{ nameMap= new NameAssignment(); }
+
+ string fileroot = getRootName(blastfile);
+ string tag = "";
+ time_t start;
+ float previousDist = 0.00000;
+ float rndPreviousDist = 0.00000;
+
+ //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
+ //must remember to delete those objects here since readBlast does not
+ read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
+ read->read(nameMap);
+
+ list = new ListVector(nameMap->getListVector());
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ start = time(NULL);
+ oldList = *list;
+
+ if (!hclusterWanted) {
+ //get distmatrix and overlap
+ SparseMatrix* distMatrix = read->getDistMatrix();
+ overlapMatrix = read->getOverlapMatrix(); //already sorted by read
+ delete read;
+
+ //create cluster
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); }
+ tag = cluster->getTag();
+ cluster->setMapWanted(true);
+
+ //open output files
+ openOutputFile(fileroot+ tag + ".list", listFile);
+ openOutputFile(fileroot+ tag + ".rabund", rabundFile);
+ openOutputFile(fileroot+ tag + ".sabund", sabundFile);
+
+ //cluster using cluster classes
+ while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
+
+ cluster->update();
+ float dist = distMatrix->getSmallDist();
+ float rndDist = roundDist(dist, precision);
+
+ if(previousDist <= 0.0000 && dist != previousDist){
+ oldList.setLabel("unique");
+ printData(&oldList);
+ }
+ else if(rndDist != rndPreviousDist){
+ if (merge) {
+ map<string, int> seq2Bin = cluster->getSeqtoBin();
+ ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+ temp->setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(temp);
+ delete temp;
+ }else{
+ oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(&oldList);
+ }
+ }
+
+ previousDist = dist;
+ rndPreviousDist = rndDist;
+ oldList = *list;
+ }
+
+ if(previousDist <= 0.0000){
+ oldList.setLabel("unique");
+ printData(&oldList);
+ }
+ else if(rndPreviousDist<cutoff){
+ if (merge) {
+ map<string, int> seq2Bin = cluster->getSeqtoBin();
+ ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+ temp->setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(temp);
+ delete temp;
+ }else{
+ oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(&oldList);
+ }
+ }
+
+ //free memory
+ overlapMatrix.clear();
+ delete distMatrix;
+ delete cluster;
+
+ }else { //use hcluster to cluster
+ tag = "fn";
+ //get distmatrix and overlap
+ overlapFile = read->getOverlapFile();
+ distFile = read->getDistFile();
+ delete read;
+
+ //sort the distance and overlap files
+ sortHclusterFiles(distFile, overlapFile);
+
+ //create cluster
+ hcluster = new HCluster(rabund, list);
+ hcluster->setMapWanted(true);
+
+ //open output files
+ openOutputFile(fileroot+ tag + ".list", listFile);
+ openOutputFile(fileroot+ tag + ".rabund", rabundFile);
+ openOutputFile(fileroot+ tag + ".sabund", sabundFile);
+
+ vector<DistNode> seqs; seqs.resize(1); // to start loop
+ exitedBreak = false; //lets you know if there is a distance stored in next
+
+ ifstream inHcluster;
+ openInputFile(distFile, inHcluster);
+
+ while (seqs.size() != 0){
+
+ seqs = getSeqs(inHcluster);
+ random_shuffle(seqs.begin(), seqs.end());
+
+ for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
+
+ if (seqs[i].seq1 != seqs[i].seq2) {
+
+ hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+
+ float rndDist = roundDist(seqs[i].dist, precision);
+
+ if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
+ oldList.setLabel("unique");
+ printData(&oldList);
+ }
+ else if((rndDist != rndPreviousDist)){
+ if (merge) {
+ map<string, int> seq2Bin = hcluster->getSeqtoBin();
+ ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+ temp->setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(temp);
+ delete temp;
+ }else{
+ oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(&oldList);
+ }
+ }
+
+ previousDist = seqs[i].dist;
+ rndPreviousDist = rndDist;
+ oldList = *list;
+ }
+ }
+ }
+ inHcluster.close();
+
+ if(previousDist <= 0.0000){
+ oldList.setLabel("unique");
+ printData(&oldList);
+ }
+ else if(rndPreviousDist<cutoff){
+ if (merge) {
+ map<string, int> seq2Bin = hcluster->getSeqtoBin();
+ ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+ temp->setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(temp);
+ delete temp;
+ }else{
+ oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
+ printData(&oldList);
+ }
+ }
+
+ delete hcluster;
+ remove(distFile.c_str());
+ remove(overlapFile.c_str());
+ }
+
+ delete list;
+ delete rabund;
+ listFile.close();
+ sabundFile.close();
+ rabundFile.close();
+
+ globaldata->setListFile(fileroot+ tag + ".list");
+ globaldata->setFormat("list");
+
+ mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+void MGClusterCommand::printData(ListVector* mergedList){
+ try {
+ mergedList->print(listFile);
+ mergedList->getRAbundVector().print(rabundFile);
+
+ SAbundVector sabund = mergedList->getSAbundVector();
+
+ sabund.print(cout);
+ sabund.print(sabundFile);
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "printData");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
+//that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
+ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
+ try {
+ //create new listvector so you don't overwrite the clustering
+ ListVector* newList = new ListVector(oldList);
+ bool done = false;
+ ifstream inOverlap;
+ int count = 0;
+
+ if (hclusterWanted) {
+ openInputFile(overlapFile, inOverlap);
+ if (inOverlap.eof()) { done = true; }
+ }else { if (overlapMatrix.size() == 0) { done = true; } }
+
+ while (!done) {
+
+ //get next overlap
+ DistNode overlapNode;
+ if (!hclusterWanted) {
+ if (count < overlapMatrix.size()) { //do we have another node in the matrix
+ overlapNode = overlapMatrix[count];
+ count++;
+ }else { break; }
+ }else {
+ if (!inOverlap.eof()) {
+ string firstName, secondName;
+ float overlapDistance;
+ inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
+
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ overlapNode.seq1 = itA->second;
+ overlapNode.seq2 = itB->second;
+ overlapNode.dist = overlapDistance;
+ }else { inOverlap.close(); break; }
+ }
+
+ if (overlapNode.dist < dist) {
+ //get names of seqs that overlap
+ string name1 = nameMap->get(overlapNode.seq1);
+ string name2 = nameMap->get(overlapNode.seq2);
+
+ //use binInfo to find out if they are already in the same bin
+ int binKeep = binInfo[name1];
+ int binRemove = binInfo[name2];
+
+ //if not merge bins and update binInfo
+ if(binKeep != binRemove) {
+ //save names in old bin
+ string names = list->get(binRemove);
+
+ //merge bins into name1s bin
+ newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
+ newList->set(binRemove, "");
+
+ //update binInfo
+ while (names.find_first_of(',') != -1) {
+ //get name from bin
+ string name = names.substr(0,names.find_first_of(','));
+ //save name and bin number
+ binInfo[name] = binKeep;
+ names = names.substr(names.find_first_of(',')+1, names.length());
+ }
+
+ //get last name
+ binInfo[names] = binKeep;
+ }
+
+ }else { done = true; }
+ }
+
+ //return listvector
+ return newList;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "mergeOPFs");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
+ try {
+ //sort distFile
+ ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);
+ readCluster->setFormat("column");
+ readCluster->read(nameMap);
+ string sortedDistFile = readCluster->getOutputFile();
+ ListVector* extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
+
+ remove(unsortedDist.c_str()); //delete unsorted file
+ distFile = sortedDistFile;
+ delete extralist;
+ delete readCluster;
+
+ //sort overlap file
+ readCluster = new ReadCluster(unsortedOverlap, cutoff);
+ readCluster->setFormat("column");
+ readCluster->read(nameMap);
+ string sortedOverlapFile = readCluster->getOutputFile();
+ extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
+
+ remove(unsortedOverlap.c_str()); //delete unsorted file
+ overlapFile = sortedOverlapFile;
+ delete extralist;
+ delete readCluster;
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "sortHclusterFiles");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
+ try {
+ string firstName, secondName;
+ float distance, prevDistance;
+ vector<DistNode> sameSeqs;
+ prevDistance = -1;
+
+ //if you are not at the beginning of the file
+ if (exitedBreak) {
+ sameSeqs.push_back(next);
+ prevDistance = next.dist;
+ exitedBreak = false;
+ }
+
+ //get entry
+ while (!filehandle.eof()) {
+
+ filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+
+ //save first one
+ if (prevDistance == -1) { prevDistance = distance; }
+
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ //using cutoff
+ if (distance > cutoff) { break; }
+
+ if (distance != -1) { //-1 means skip me
+
+ //are the distances the same
+ if (distance == prevDistance) { //save in vector
+ DistNode temp(itA->second, itB->second, distance);
+ sameSeqs.push_back(temp);
+ exitedBreak = false;
+ }else{
+ next.seq1 = itA->second;
+ next.seq2 = itB->second;
+ next.dist = distance;
+ exitedBreak = true;
+ break;
+ }
+ }
+ }
+
+ return sameSeqs;
+ }
+ catch(exception& e) {
+ errorOut(e, "MGClusterCommand", "getSeqs");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+
+
+
+
--- /dev/null
+#ifndef MGCLUSTERCOMMAND_H
+#define MGCLUSTERCOMMAND_H
+
+/*
+ * mgclustercommand.h
+ * Mothur
+ *
+ * Created by westcott on 12/11/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "readblast.h"
+#include "sparsematrix.hpp"
+#include "nameassignment.hpp"
+#include "globaldata.hpp"
+#include "cluster.hpp"
+#include "hcluster.h"
+
+/**********************************************************************/
+
+class MGClusterCommand : public Command {
+
+public:
+ MGClusterCommand(string);
+ ~MGClusterCommand();
+ int execute();
+ void help();
+
+private:
+ GlobalData* globaldata;
+ ReadBlast* read;
+ NameAssignment* nameMap;
+ Cluster* cluster;
+ HCluster* hcluster;
+ ListVector* list;
+ ListVector oldList;
+ vector<DistNode> overlapMatrix;
+ DistNode next;
+
+
+ string blastfile, method, namefile, overlapFile, distFile;
+ ofstream sabundFile, rabundFile, listFile;
+ float cutoff, penalty;
+ int precision, length, precisionLength;
+ bool abort, minWanted, hclusterWanted, exitedBreak, merge;
+
+ void printData(ListVector*);
+ ListVector* mergeOPFs(map<string, int>, float);
+ void sortHclusterFiles(string, string);
+ vector<DistNode> getSeqs(ifstream&);
+
+};
+
+/**********************************************************************/
+
+#endif
+
+
+
// data[firstCol] = secondCol; //store data in map
list.push_back(secondCol); //adds data's value to list
+ reverse[rowIndex] = firstCol;
(*this)[firstCol] = rowIndex++;
gobble(fileHandle);
}
int num = (*this).size();
(*this)[name] = num;
+ reverse[num] = name;
list.push_back(name);
}
//**********************************************************************************************************************
-void NameAssignment::print(void){
+void NameAssignment::print(ostream& out){
try {
map<string,int>::iterator it;
+cout << (*this).size() << endl;
for(it = (*this).begin(); it!=(*this).end(); it++){
- mothurOut(it->first + "\t" + toString(it->second)); mothurOutEndLine(); //prints out keys and values of the map this.
+ out << it->first << '\t' << it->second << endl; //prints out keys and values of the map this.
}
}
catch(exception& e) {
return (*this)[key];
}
+//**********************************************************************************************************************
+
+string NameAssignment::get(int key){
+
+ return reverse[key];
+}
//**********************************************************************************************************************
void readMap();
ListVector getListVector();
int get(string);
- void print();
+ string get(int);
+ void print(ostream&);
void push_back(string);
private:
ifstream fileHandle;
ListVector list;
+ map<int, string> reverse;
};
--- /dev/null
+/*
+ * readblast.cpp
+ * Mothur
+ *
+ * Created by westcott on 12/10/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "readblast.h"
+#include "progress.hpp"
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareOverlap(DistNode left, DistNode right){
+ return (left.dist < right.dist);
+}
+/*********************************************************************************************/
+ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) {
+ try {
+ matrix = NULL;
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadBlast", "ReadBlast");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+//assumptions about the blast file:
+//1. if duplicate lines occur the first line is always best and is chosen
+//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
+void ReadBlast::read(NameAssignment* nameMap) {
+ try {
+
+ //if the user has not given a names file read names from blastfile
+ if (nameMap->size() == 0) { readNames(nameMap); }
+ int nseqs = nameMap->size();
+
+ ifstream fileHandle;
+ openInputFile(blastfile, fileHandle);
+
+ string firstName, secondName, eScore, currentRow;
+ string repeatName = "";
+ int count = 1;
+ float distance, thisoverlap, refScore;
+ float percentId;
+ float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
+
+ ofstream outDist;
+ ofstream outOverlap;
+
+ //create objects needed for read
+ if (!hclusterWanted) {
+ matrix = new SparseMatrix();
+ }else{
+ overlapFile = getRootName(blastfile) + "overlap.dist";
+ distFile = getRootName(blastfile) + "hclusterDists.dist";
+
+ openOutputFile(overlapFile, outOverlap);
+ openOutputFile(distFile, outDist);
+ }
+
+ Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
+
+ //this is used to quickly find if we already have a distance for this combo
+ vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
+ map<int, float> thisRowsBlastScores;
+
+ if (!fileHandle.eof()) {
+ //read in line from file
+ fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+ gobble(fileHandle);
+
+ currentRow = firstName;
+ lengthThisSeq = numBases;
+ repeatName = firstName + secondName;
+
+ if (firstName == secondName) { refScore = score; }
+ else{
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ }
+ }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); }
+
+
+ //read file
+ while(!fileHandle.eof()){
+
+ //read in line from file
+ fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+ //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
+ gobble(fileHandle);
+
+ string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
+
+ //if this is a new pairing
+ if (temp != repeatName) {
+ repeatName = temp;
+
+ if (currentRow == firstName) {
+ //cout << "first = " << firstName << " second = " << secondName << endl;
+ if (firstName == secondName) {
+ refScore = score;
+ reading->update((count + nseqs));
+ count++;
+ }else{
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ //save score
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ } //end else
+ }else { //end row
+ //convert blast scores to distance and add cell to sparse matrix if we can
+ map<int, float>::iterator it;
+ map<int, float>::iterator itDist;
+ for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
+ distance = 1.0 - (it->second / refScore);
+
+ //do we already have the distance calculated for b->a
+ map<string,int>::iterator itA = nameMap->find(currentRow);
+ itDist = dists[it->first].find(itA->second);
+
+ //if we have it then compare
+ if (itDist != dists[it->first].end()) {
+ //if you want the minimum blast score ratio, then pick max distance
+ if(minWanted) { distance = max(itDist->second, distance); }
+ else{ distance = min(itDist->second, distance); }
+
+ //is this distance below cutoff
+ if (distance < cutoff) {
+ if (!hclusterWanted) {
+ PCell value(itA->second, it->first, distance);
+ matrix->addCell(value);
+ }else{
+ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+ }
+ }
+ //not going to need this again
+ dists[it->first].erase(itDist);
+ }else { //save this value until we get the other ratio
+ dists[itA->second][it->first] = distance;
+ }
+ }
+ //clear out last rows info
+ thisRowsBlastScores.clear();
+
+ currentRow = firstName;
+ lengthThisSeq = numBases;
+
+ //add this row to thisRowsBlastScores
+ if (firstName == secondName) { refScore = score; }
+ else{ //add this row to thisRowsBlastScores
+
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ }
+ }//end if current row
+ }//end if repeat
+ }//end while
+
+ //get last rows info stored
+ //convert blast scores to distance and add cell to sparse matrix if we can
+ map<int, float>::iterator it;
+ map<int, float>::iterator itDist;
+ for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
+ distance = 1.0 - (it->second / refScore);
+
+ //do we already have the distance calculated for b->a
+ map<string,int>::iterator itA = nameMap->find(currentRow);
+ itDist = dists[it->first].find(itA->second);
+
+ //if we have it then compare
+ if (itDist != dists[it->first].end()) {
+ //if you want the minimum blast score ratio, then pick max distance
+ if(minWanted) { distance = max(itDist->second, distance); }
+ else{ distance = min(itDist->second, distance); }
+
+ //is this distance below cutoff
+ if (distance < cutoff) {
+ if (!hclusterWanted) {
+ PCell value(itA->second, it->first, distance);
+ matrix->addCell(value);
+ }else{
+ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+ }
+ }
+ //not going to need this again
+ dists[it->first].erase(itDist);
+ }else { //save this value until we get the other ratio
+ dists[itA->second][it->first] = distance;
+ }
+ }
+ //clear out info
+ thisRowsBlastScores.clear();
+ dists.clear();
+
+ if (!hclusterWanted) {
+ sort(overlap.begin(), overlap.end(), compareOverlap);
+ }else {
+ outDist.close();
+ outOverlap.close();
+ }
+
+ reading->finish();
+ delete reading;
+ fileHandle.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadBlast", "read");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+void ReadBlast::readNames(NameAssignment* nameMap) {
+ try {
+ mothurOut("Reading names... "); cout.flush();
+
+ string name, hold, prevName;
+ int num = 1;
+
+ ifstream in;
+ openInputFile(blastfile, in);
+
+ //read first line
+ in >> prevName;
+ for (int i = 0; i < 11; i++) { in >> hold; }
+ gobble(in);
+
+ //save name in nameMap
+ nameMap->push_back(prevName);
+
+ while (!in.eof()) {
+
+ //read line
+ in >> name;
+ for (int i = 0; i < 11; i++) { in >> hold; }
+ gobble(in);
+
+ //is this a new name?
+ if (name != prevName) {
+ prevName = name;
+ nameMap->push_back(name);
+ num++;
+ }
+ }
+
+ in.close();
+
+ //write out names file
+ //string outNames = getRootName(blastfile) + "names";
+ //ofstream out;
+ //openOutputFile(outNames, out);
+ //nameMap->print(out);
+ //out.close();
+
+ mothurOut(toString(num) + " names read."); mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadBlast", "readNames");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+
+
+
--- /dev/null
+#ifndef READBLAST_H
+#define READBLAST_H
+/*
+ * readblast.h
+ * Mothur
+ *
+ * Created by westcott on 12/10/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sparsematrix.hpp"
+#include "nameassignment.hpp"
+
+/****************************************************************************************/
+struct DistNode {
+ int seq1;
+ int seq2;
+ float dist;
+ DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+ DistNode() {}
+ ~DistNode() {}
+};
+/****************************************************************************************/
+
+//Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction.
+//the user of this object is responsible for deleting the matrix and list if they call the read or there will be a memory leak
+//it is done this way so the read can be deleted and the information still used.
+
+class ReadBlast {
+
+public:
+ ReadBlast(string, float, float, int, bool, bool); //blastfile, cutoff, penalty, length of overlap, min or max bsr, hclusterWanted
+ ~ReadBlast() {}
+
+ void read(NameAssignment*);
+ SparseMatrix* getDistMatrix() { return matrix; }
+ vector<DistNode> getOverlapMatrix() { return overlap; }
+ string getOverlapFile() { return overlapFile; }
+ string getDistFile() { return distFile; }
+
+private:
+ string blastfile, overlapFile, distFile;
+ int length; //number of amino acids overlapped
+ float penalty, cutoff; //penalty is used to adjust error rate
+ bool minWanted; //if true choose min bsr, if false choose max bsr
+ bool hclusterWanted;
+
+ SparseMatrix* matrix;
+ vector<DistNode> overlap;
+
+ void readNames(NameAssignment*);
+};
+
+/*******************************************************************************************/
+
+#endif
+
map<string,int>::iterator itB = nameMap->find(secondName);\r
\r
if(itA == nameMap->end()){\r
- cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";\r
+ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);\r
}\r
if(itB == nameMap->end()){\r
- cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n";\r
+ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);\r
}\r
\r
if (distance == -1) { distance = 1000000; }\r
openInputFile(distFileName, in);
matrix = new FullMatrix(in); //reads the matrix file
in.close();
+
+ //if files don't match...
+ if (matrix->getNumSeqs() < groupMap->getNumSeqs()) {
+ mothurOut("Your distance file contains " + toString(matrix->getNumSeqs()) + " sequences, and your group file contains " + toString(groupMap->getNumSeqs()) + " sequences."); mothurOutEndLine();
+ //create new group file
+ string newGroupFile = getRootName(groupfile) + "editted.groups";
+ ofstream outGroups;
+ openOutputFile(newGroupFile, outGroups);
+
+ for (int i = 0; i < matrix->getNumSeqs(); i++) {
+ Names temp = matrix->getRowInfo(i);
+ outGroups << temp.seqName << '\t' << temp.groupName << endl;
+ }
+ outGroups.close();
+
+ mothurOut(newGroupFile + " is a new group file containing only the sequence that are in your distance file. I will read this file instead."); mothurOutEndLine();
+
+ //read new groupfile
+ delete groupMap; groupMap = NULL;
+ groupfile = newGroupFile;
+ globaldata->setGroupFile(groupfile);
+
+ groupMap = new GroupMap(groupfile);
+ groupMap->readMap();
+
+ globaldata->gGroupmap = groupMap;
+ }
+
//memory leak prevention
if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix; }
globaldata->gMatrix = matrix; //save matrix for coverage commands
vector<vector<double> > SLibshuff::evaluateAll(){
try{
savedMins.resize(numGroups);
-
vector<vector<double> > dCXYValues(numGroups);
for(int i=0;i<numGroups;i++){
}
if(minLength > 0 || maxLength > 0){
success = cullLength(currSeq);
- if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; }
if(!success){ trashCode += 'l'; }
}
if(maxHomoP > 0){