A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
A70DECD91063D8B40057C03C /* secondarystructurecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */; };
A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; };
+ A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACCF10848E6100139801 /* hclustercommand.cpp */; };
+ A729ACE410849BBE00139801 /* hcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACE310849BBE00139801 /* hcluster.cpp */; };
A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */; };
+ A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A751ACE91098B283003D0911 /* readcluster.cpp */; };
A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = secondarystructurecommand.cpp; sourceTree = SOURCE_ROOT; };
A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = SOURCE_ROOT; };
A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = SOURCE_ROOT; };
- A73329CD1083B3B3003B10C5 /* getlistcountcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlistcountcommand.h; sourceTree = "<group>"; };
- A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlistcountcommand.cpp; sourceTree = "<group>"; };
+ A729ACCE10848E6100139801 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = SOURCE_ROOT; };
+ A729ACCF10848E6100139801 /* hclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hclustercommand.cpp; sourceTree = SOURCE_ROOT; };
+ A729ACE210849BBE00139801 /* hcluster.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hcluster.h; sourceTree = SOURCE_ROOT; };
+ A729ACE310849BBE00139801 /* hcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hcluster.cpp; sourceTree = SOURCE_ROOT; };
+ A73329CD1083B3B3003B10C5 /* getlistcountcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlistcountcommand.h; sourceTree = SOURCE_ROOT; };
+ A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlistcountcommand.cpp; sourceTree = SOURCE_ROOT; };
+ A751ACE81098B283003D0911 /* readcluster.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readcluster.h; sourceTree = SOURCE_ROOT; };
+ A751ACE91098B283003D0911 /* readcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readcluster.cpp; sourceTree = SOURCE_ROOT; };
A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; };
A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; };
A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; };
374F2FE81006346600E97C66 /* chimera */,
37D927C20F21331F001D4494 /* cluster.hpp */,
37D927C10F21331F001D4494 /* cluster.cpp */,
+ A729ACE210849BBE00139801 /* hcluster.h */,
+ A729ACE310849BBE00139801 /* hcluster.cpp */,
37D928A90F2133E5001D4494 /* commands */,
37D927C60F21331F001D4494 /* collect.h */,
37D927C50F21331F001D4494 /* collect.cpp */,
7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */,
37E3ED800F4D9D0D00B5DF02 /* mothur.h */,
37D927EF0F21331F001D4494 /* mothur.cpp */,
- 37D927F10F21331F001D4494 /* nameassignment.hpp */,
- 37D927F00F21331F001D4494 /* nameassignment.cpp */,
373C691E0FC1C98600137ACD /* nast.hpp */,
373C691D0FC1C98600137ACD /* nast.cpp */,
373C692A0FC1C9EB00137ACD /* nastreport.hpp */,
3796441D0FB9B9650081FDB6 /* read */ = {
isa = PBXGroup;
children = (
+ A751ACE81098B283003D0911 /* readcluster.h */,
+ A751ACE91098B283003D0911 /* readcluster.cpp */,
375AA1340F9E433D008EF9B8 /* readcolumn.h */,
375AA1330F9E433D008EF9B8 /* readcolumn.cpp */,
37D928130F21331F001D4494 /* readmatrix.hpp */,
37B73CA51004D89A008C4B41 /* getseqscommand.cpp */,
A7E4A781106913F900688F62 /* getsharedotucommand.h */,
A7E4A782106913F900688F62 /* getsharedotucommand.cpp */,
+ A729ACCE10848E6100139801 /* hclustercommand.h */,
+ A729ACCF10848E6100139801 /* hclustercommand.cpp */,
375873F10F7D64800040F377 /* heatmapcommand.h */,
375873F00F7D64800040F377 /* heatmapcommand.cpp */,
378598640FDD497000EF9D03 /* heatmapsimcommand.h */,
37D927EB0F21331F001D4494 /* kmerdb.cpp */,
37D927EE0F21331F001D4494 /* listvector.hpp */,
37D927ED0F21331F001D4494 /* listvector.cpp */,
+ 37D927F10F21331F001D4494 /* nameassignment.hpp */,
+ 37D927F00F21331F001D4494 /* nameassignment.cpp */,
37D927F80F21331F001D4494 /* ordervector.hpp */,
37D927F70F21331F001D4494 /* ordervector.cpp */,
37D928000F21331F001D4494 /* rabundvector.hpp */,
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */,
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */,
A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */,
+ A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */,
+ A729ACE410849BBE00139801 /* hcluster.cpp in Sources */,
+ A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
}
if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
delete list;
list = input->getListVector(lastLabel);
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
}
lastLabel = list->getLabel();
//you have a label the user want that is smaller than this label and the last label has not already been processed
if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
delete order;
order = input->getSharedOrderVector(lastLabel);
processedLabels.insert(order->getLabel());
userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
}
if (abort == false) {
- //get matrix, list and rabund for execute
+
+ //get matrix, list and rabund for execute
if(globaldata->gSparseMatrix != NULL) { matrix = globaldata->gSparseMatrix; }
if(globaldata->gListVector != NULL){
}
//you have a label the user want that is smaller than this label and the last label has not already been processed
if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
delete order;
order = (input->getOrderVector(lastLabel));
mothurOut(order->getLabel()); mothurOutEndLine();
processedLabels.insert(order->getLabel());
userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
}
lastLabel = order->getLabel();
//you have a label the user want that is smaller than this label and the last label has not already been processed
if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
+
delete order;
order = input->getSharedOrderVector(lastLabel);
mothurOut(order->getLabel()); mothurOutEndLine();
processedLabels.insert(order->getLabel());
userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
}
#include "secondarystructurecommand.h"
#include "getsharedotucommand.h"
#include "getlistcountcommand.h"
+#include "hclustercommand.h"
/***********************************************************/
commands["get.sharedotu"] = "get.sharedotu";
commands["get.listcount"] = "get.listcount";
commands["quit"] = "quit";
+
+ commands["hcluster"] = "hcluster";
}
/***********************************************************/
else if(commandName == "align.check") { command = new AlignCheckCommand(optionString); }
else if(commandName == "get.sharedotu") { command = new GetSharedOTUCommand(optionString); }
else if(commandName == "get.listcount") { command = new GetListCountCommand(optionString); }
+
+ else if(commandName == "hcluster") { command = new HClusterCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
}
if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
delete list;
list = input->getListVector(lastLabel);
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
}
lastLabel = list->getLabel();
}
if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
delete list;
list = input->getListVector(lastLabel);
mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
}
lastLabel = list->getLabel();
}
if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
delete list;
list = input->getListVector(lastLabel);
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
}
lastLabel = list->getLabel();
}
if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
+
delete order;
order = (input->getOrderVector(lastLabel));
processedLabels.insert(order->getLabel());
userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
}
mothurOut("With this option you can use the names file as an input in get.seqs and remove.seqs commands. To do this enter output=accnos. \n");
mothurOut("The get.sharedotu command outputs a .names file for each distance level containing a list of sequences in the OTUs shared by the groups specified.\n");
mothurOut("The get.sharedotu command should be in the following format: get.sabund(label=yourLabels, groups=yourGroups, fasta=yourFastafile, output=yourOutput).\n");
- mothurOut("Example get.sharedotu(label=unique-0.01, group=forest-pasture, fasta=amazon.fasta, output=accnos).\n");
+ mothurOut("Example get.sharedotu(list=amazon.fn.list, label=unique-0.01, group=forest-pasture, fasta=amazon.fasta, output=accnos).\n");
mothurOut("The default value for label is all labels in your inputfile. The default for groups is all groups in your file.\n");
mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
}
}
if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
mothurOut(lastlist->getLabel());
process(lastlist);
processedLabels.insert(lastlist->getLabel());
userLabels.erase(lastlist->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
}
lastLabel = list->getLabel();
string GlobalData::getGroupFile() { return groupfile; }
string GlobalData::getOrderFile() { return orderfile; }
string GlobalData::getTreeFile() { return treefile; }
-string GlobalData::getSharedFile() { return sharedfile; }
-//string GlobalData::getFastaFile() { return fastafile; }
+string GlobalData::getSharedFile() { return sharedfile; }
string GlobalData::getFormat() { return format; }
void GlobalData::setListFile(string file) { listfile = file; inputFileName = file; }
-//void GlobalData::setFastaFile(string file) { fastafile = file; inputFileName = file; }
void GlobalData::setTreeFile(string file) { treefile = file; inputFileName = file; }
void GlobalData::setRabundFile(string file) { rabundfile = file; inputFileName = file; }
void GlobalData::setSabundFile(string file) { sabundfile = file; inputFileName = file; }
gMatrix = NULL;
gTreemap = NULL;
gSequenceDB = NULL;
+ nameMap = NULL;
}
/*******************************************************/
if (gTreemap != NULL) { delete gTreemap; gTreemap = NULL; }
if (gSequenceDB != NULL) { delete gSequenceDB; gSequenceDB = NULL;}
+
+ if (nameMap != NULL) { delete nameMap; nameMap = NULL; }
gTree.clear();
if (gMatrix != NULL) { delete gMatrix; gMatrix = NULL;}
if (gTreemap != NULL) { delete gTreemap; gTreemap = NULL; }
if (gSequenceDB != NULL) { delete gSequenceDB; gSequenceDB = NULL;}
+ if (nameMap != NULL) { delete nameMap; nameMap = NULL; }
}
catch(exception& e) {
errorOut(e, "GlobalData", "~GlobalData");
#include "tree.h"
#include "sparsematrix.hpp"
#include "sequencedb.h"
+#include "nameassignment.hpp"
class ListVector;
void setOrderFile(string file);
void setFormat(string); //do we need this?
+ NameAssignment* nameMap;
+
void clear();
void clearLabels();
void clearAbund();
private:
- string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, treefile, sharedfile, format;
+ string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, treefile, sharedfile, format, distfile;
static GlobalData* _uniqueInstance;
GlobalData( const GlobalData& ); // Disable copy constructor
--- /dev/null
+/*
+ * hcluster.cpp
+ * Mothur
+ *
+ * Created by westcott on 10/13/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "hcluster.h"
+#include "rabundvector.hpp"
+#include "listvector.hpp"
+#include "sparsematrix.hpp"
+
+/***********************************************************************/
+
+HCluster::HCluster(RAbundVector* rav, ListVector* lv) : rabund(rav), list(lv){
+ try {
+
+ numSeqs = list->getNumSeqs();
+
+ //initialize cluster array
+ for (int i = 0; i < numSeqs; i++) {
+ clusterNode temp(1, -1, i);
+ clusterArray.push_back(temp);
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "HCluster");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+void HCluster::clusterBins(){
+ try {
+ //cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild);
+
+ rabund->set(clusterArray[smallCol].smallChild, rabund->get(clusterArray[smallRow].smallChild)+rabund->get(clusterArray[smallCol].smallChild));
+ rabund->set(clusterArray[smallRow].smallChild, 0);
+ rabund->setLabel(toString(smallDist));
+
+ //cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "clusterBins");
+ exit(1);
+ }
+
+
+}
+
+/***********************************************************************/
+
+void HCluster::clusterNames(){
+ try {
+ ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
+
+ list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
+ list->set(clusterArray[smallRow].smallChild, "");
+ list->setLabel(toString(smallDist));
+
+ //cout << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild) << endl;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "clusterNames");
+ exit(1);
+ }
+
+}
+/***********************************************************************/
+int HCluster::getUpmostParent(int node){
+ try {
+
+ while (clusterArray[node].parent != -1) {
+ node = clusterArray[node].parent;
+ }
+
+ return node;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "getUpmostParent");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void HCluster::printInfo(){
+ try {
+
+ cout << "link table" << endl;
+ for (it = activeLinks.begin(); it!= activeLinks.end(); it++) {
+ cout << it->first << " = " << it->second << endl;
+ }
+ cout << endl;
+ for (int i = 0; i < linkTable.size(); i++) {
+ for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
+ cout << it->first << '\t' << it->second << '\t' << '\t';
+ }
+ cout << endl;
+ }
+ cout << endl << "clusterArray" << endl;
+
+ for (int i = 0; i < clusterArray.size(); i++) {
+ cout << i << '\t' << clusterArray[i].numSeq << '\t' << clusterArray[i].parent << '\t' << clusterArray[i].smallChild << endl;
+ }
+ cout << endl;
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "getUpmostParent");
+ exit(1);
+ }
+}
+/***********************************************************************/
+int HCluster::makeActive() {
+ try {
+
+ int linkValue = 1;
+//cout << "active - here" << endl;
+ it = activeLinks.find(smallRow);
+ it2 = activeLinks.find(smallCol);
+
+ if ((it == activeLinks.end()) && (it2 == activeLinks.end())) { //both are not active so add them
+ int size = linkTable.size();
+ map<int, int> temp; map<int, int> temp2;
+
+ //add link to eachother
+ temp[smallRow] = 1; // 1 2
+ temp2[smallCol] = 1; // 1 0 1
+ // 2 1 0
+ linkTable.push_back(temp);
+ linkTable.push_back(temp2);
+
+ //add to activeLinks
+ activeLinks[smallRow] = size;
+ activeLinks[smallCol] = size+1;
+//cout << "active - here1" << endl;
+ }else if ((it != activeLinks.end()) && (it2 == activeLinks.end())) { //smallRow is active, smallCol is not
+ int size = linkTable.size();
+ int alreadyActiveRow = it->second;
+ map<int, int> temp;
+
+ //add link to eachother
+ temp[smallRow] = 1; // 6 2 3 5
+ linkTable.push_back(temp); // 6 0 1 2 0
+ linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
+ // 3 2 1 0 0
+ // 5 0 1 0 0
+ //add to activeLinks
+ activeLinks[smallCol] = size;
+//cout << "active - here2" << endl;
+ }else if ((it == activeLinks.end()) && (it2 != activeLinks.end())) { //smallCol is active, smallRow is not
+ int size = linkTable.size();
+ int alreadyActiveCol = it2->second;
+ map<int, int> temp;
+
+ //add link to eachother
+ temp[smallCol] = 1; // 6 2 3 5
+ linkTable.push_back(temp); // 6 0 1 2 0
+ linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
+ // 3 2 1 0 0
+ // 5 0 1 0 0
+ //add to activeLinks
+ activeLinks[smallRow] = size;
+//cout << "active - here3" << endl;
+ }else { //both are active so add one
+ int row = it->second;
+ int col = it2->second;
+//cout << row << '\t' << col << endl;
+
+ linkTable[row][smallCol]++;
+ linkTable[col][smallRow]++;
+ linkValue = linkTable[row][smallCol];
+//cout << "active - here4" << endl;
+ }
+
+ return linkValue;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "makeActive");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void HCluster::updateArrayandLinkTable() {
+ try {
+ //if cluster was made update clusterArray and linkTable
+ int size = clusterArray.size();
+
+ //add new node
+ clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
+ clusterArray.push_back(temp);
+
+ //update child nodes
+ clusterArray[smallRow].parent = size;
+ clusterArray[smallCol].parent = size;
+
+ //update linkTable by merging clustered rows and columns
+ int rowSpot = activeLinks[smallRow];
+ int colSpot = activeLinks[smallCol];
+ //cout << "here" << endl;
+ //fix old rows
+ for (int i = 0; i < linkTable.size(); i++) {
+ //check if they are in map
+ it = linkTable[i].find(smallRow);
+ it2 = linkTable[i].find(smallCol);
+
+ if ((it!=linkTable[i].end()) && (it2!=linkTable[i].end())) { //they are both there
+ linkTable[i][size] = linkTable[i][smallRow]+linkTable[i][smallCol];
+ linkTable[i].erase(smallCol); //delete col row
+ linkTable[i].erase(smallRow); //delete col row
+ }else if ((it==linkTable[i].end()) && (it2!=linkTable[i].end())) { //only col
+ linkTable[i][size] = linkTable[i][smallCol];
+ linkTable[i].erase(smallCol); //delete col
+ }else if ((it!=linkTable[i].end()) && (it2==linkTable[i].end())) { //only row
+ linkTable[i][size] = linkTable[i][smallRow];
+ linkTable[i].erase(smallRow); //delete col
+ }
+ }
+ //printInfo();
+//cout << "here2" << endl;
+ //merge their values
+ for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
+ it2 = linkTable[colSpot].find(it->first); //does the col also have this
+
+ if (it2 == linkTable[colSpot].end()) { //not there so add it
+ linkTable[colSpot][it->first] = it->second;
+ }else { //merge them
+ linkTable[colSpot][it->first] = it->second+it2->second;
+ }
+ }
+//cout << "here3" << endl;
+ linkTable[colSpot].erase(size);
+ linkTable.erase(linkTable.begin()+rowSpot); //delete col
+ //printInfo();
+ //update activerows
+ activeLinks.erase(smallRow);
+ activeLinks.erase(smallCol);
+
+ if(rowSpot>colSpot) { activeLinks[size] = colSpot; }
+ else{ activeLinks[size] = colSpot-1; }
+
+ //adjust everybody elses spot since you deleted - time vs. space
+ for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
+ if (it->second > rowSpot) { activeLinks[it->first]--; }
+ }
+
+
+//cout << "here4" << endl;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "updateArrayandLinkTable");
+ exit(1);
+ }
+}
+/***********************************************************************/
+bool HCluster::update(int row, int col, float distance){
+ try {
+
+ smallRow = row;
+ smallCol = col;
+ smallDist = distance;
+ bool clustered = false;
+
+ //find upmost parent of row and col
+ smallRow = getUpmostParent(smallRow);
+ smallCol = getUpmostParent(smallCol);
+ //cout << "smallRow = " << smallRow << " smallCol = " << smallCol << endl;
+
+ //are they active in the link table
+ int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+ //printInfo();
+//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
+ //can we cluster???
+ if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {
+ updateArrayandLinkTable();
+ clusterBins();
+ clusterNames();
+ clustered = true;
+ //printInfo();
+ }
+
+
+ return clustered;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "update");
+ exit(1);
+ }
+}
+
+
+/***********************************************************************/
+
+
--- /dev/null
+#ifndef HCLUSTER_H
+#define HCLUSTER_H
+
+/*
+ * hcluster.h
+ * Mothur
+ *
+ * Created by westcott on 10/13/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "mothur.h"
+
+class RAbundVector;
+class ListVector;
+/************************************************************/
+struct clusterNode {
+ int numSeq;
+ int parent;
+ int smallChild; //used to make linkTable work with list and rabund
+ clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {};
+};
+
+/***********************************************************************/
+class HCluster {
+
+public:
+ HCluster(RAbundVector*, ListVector*);
+ bool update(int, int, float);
+ //string getTag();
+
+protected:
+ void clusterBins();
+ void clusterNames();
+ int getUpmostParent(int);
+ int makeActive();
+ void printInfo();
+ void updateArrayandLinkTable();
+
+ RAbundVector* rabund;
+ ListVector* list;
+
+ vector<clusterNode> clusterArray;
+ vector< map<int, int> > linkTable; // vector of maps - linkTable[1][6] = 2 would mean sequence in spot 1 has 2 links with sequence in 6
+ map<int, int> activeLinks; //maps sequence to index in linkTable
+ map<int, int>::iterator it;
+ map<int, int>::iterator it2;
+
+ int numSeqs;
+
+ int smallRow;
+ int smallCol;
+ float smallDist;
+
+};
+
+/***********************************************************************/
+
+
+
+
+
+
+
+#endif
+
+
--- /dev/null
+/*
+ * hclustercommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 10/13/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "hclustercommand.h"
+
+//**********************************************************************************************************************
+//This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
+HClusterCommand::HClusterCommand(string option){
+ try{
+ globaldata = GlobalData::getInstance();
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+ //valid paramters for this command
+ string Array[] = {"cutoff","precision","method","showabund","timing","phylip","column","name","sorted"};
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
+ abort = true;
+ }
+ }
+
+ globaldata->newRead();
+
+ //check for required parameters
+ phylipfile = validParameter.validFile(parameters, "phylip", true);
+ if (phylipfile == "not open") { abort = true; }
+ else if (phylipfile == "not found") { phylipfile = ""; }
+ else { distfile = phylipfile; format = "phylip"; }
+
+ columnfile = validParameter.validFile(parameters, "column", true);
+ if (columnfile == "not open") { abort = true; }
+ else if (columnfile == "not found") { columnfile = ""; }
+ else { distfile = columnfile; format = "column"; }
+
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not open") { abort = true; }
+ else if (namefile == "not found") { namefile = ""; }
+
+ if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a cluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+ else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+
+ if (columnfile != "") {
+ if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
+ }
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ //get user cutoff and precision or use defaults
+ string temp;
+ temp = validParameter.validFile(parameters, "precision", false);
+ if (temp == "not found") { temp = "100"; }
+ //saves precision legnth for formatting below
+ length = temp.length();
+ convert(temp, precision);
+
+ temp = validParameter.validFile(parameters, "cutoff", false);
+ if (temp == "not found") { temp = "10"; }
+ convert(temp, cutoff);
+ cutoff += (5 / (precision * 10.0));
+
+ method = validParameter.validFile(parameters, "method", false);
+ if (method == "not found") { method = "nearest"; }
+
+ if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
+ else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
+
+ showabund = validParameter.validFile(parameters, "showabund", false);
+ if (showabund == "not found") { showabund = "T"; }
+
+ sort = validParameter.validFile(parameters, "sorted", false);
+ if (sort == "not found") { sort = "F"; }
+ sorted = isTrue(sort);
+
+ timing = validParameter.validFile(parameters, "timing", false);
+ if (timing == "not found") { timing = "F"; }
+
+
+ if (abort == false) {
+
+ fileroot = getRootName(distfile);
+
+ tag = "fn"; //until we figure out average and nearest methods
+
+ openOutputFile(fileroot+ tag + ".sabund", sabundFile);
+ openOutputFile(fileroot+ tag + ".rabund", rabundFile);
+ openOutputFile(fileroot+ tag + ".list", listFile);
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "HClusterCommand", "HClusterCommand");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+void HClusterCommand::help(){
+ try {
+ mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
+ mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
+ mothurOut("The cluster command should be in the following format: \n");
+ mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+ mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "HClusterCommand", "help");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+HClusterCommand::~HClusterCommand(){}
+
+//**********************************************************************************************************************
+
+int HClusterCommand::execute(){
+ try {
+
+ if (abort == true) { return 0; }
+
+ if(namefile != ""){
+ globaldata->nameMap = new NameAssignment(namefile);
+ globaldata->nameMap->readMap();
+ }else{
+ globaldata->nameMap = NULL;
+ }
+
+ if (!sorted) {
+ read = new ReadCluster(distfile, cutoff);
+ read->setFormat(format);
+ read->read(globaldata->nameMap);
+ distfile = read->getOutputFile();
+
+ list = read->getListVector();
+ delete read;
+ }else {
+ list = new ListVector(globaldata->nameMap->getListVector());
+ }
+
+ //list vector made by read contains all sequence names
+ if(list != NULL){
+ rabund = new RAbundVector(list->getRAbundVector());
+ }else{
+ mothurOut("Error: no list vector!"); mothurOutEndLine(); return 0;
+ }
+
+
+
+ float previousDist = 0.00000;
+ float rndPreviousDist = 0.00000;
+ oldRAbund = *rabund;
+ oldList = *list;
+
+ print_start = true;
+ start = time(NULL);
+
+//cout << "here" << endl;
+ ifstream in;
+ openInputFile(distfile, in);
+ string firstName, secondName;
+ float distance;
+
+ cluster = new HCluster(rabund, list);
+ bool clusteredSomething;
+ vector<seqDist> seqs; seqs.resize(1); // to start loop
+ exitedBreak = false; //lets you know if there is a distance stored in next
+
+ while (seqs.size() != 0){
+
+ seqs = getSeqs(in);
+ random_shuffle(seqs.begin(), seqs.end());
+
+ if (seqs.size() == 0) { break; } //there are no more distances
+
+ for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
+
+ if (print_start && isTrue(timing)) {
+ mothurOut("Clustering (" + tag + ") dist " + toString(distance) + "/"
+ + toString(roundDist(distance, precision))
+ + "\t(precision: " + toString(precision) + ")");
+ cout.flush();
+ print_start = false;
+ }
+
+ ///cout << "before cluster update" << endl;
+ if (seqs[i].seq1 != seqs[i].seq2) {
+ clusteredSomething = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+
+ float rndDist = roundDist(seqs[i].dist, precision);
+ //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;
+
+
+ if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
+ printData("unique");
+ }
+ else if((rndDist != rndPreviousDist)){
+ printData(toString(rndPreviousDist, length-1));
+ }
+
+ previousDist = seqs[i].dist;
+ rndPreviousDist = rndDist;
+ oldRAbund = *rabund;
+ oldList = *list;
+ }
+ }
+ }
+
+ in.close();
+
+ if (print_start && isTrue(timing)) {
+ //mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
+ //+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+ cout.flush();
+ print_start = false;
+ }
+
+ if(previousDist <= 0.0000){
+ printData("unique");
+ }
+ else if(rndPreviousDist<cutoff){
+ printData(toString(rndPreviousDist, length-1));
+ }
+
+ //delete globaldata's copy of the sparsematrix and listvector to free up memory
+ delete globaldata->gListVector; globaldata->gListVector = NULL;
+
+ //saves .list file so you can do the collect, rarefaction and summary commands without doing a read.list
+ if (globaldata->getFormat() == "phylip") { globaldata->setPhylipFile(""); }
+ else if (globaldata->getFormat() == "column") { globaldata->setColumnFile(""); }
+
+ globaldata->setListFile(fileroot+ tag + ".list");
+ globaldata->setNameFile("");
+ globaldata->setFormat("list");
+
+ sabundFile.close();
+ rabundFile.close();
+ listFile.close();
+ if (isTrue(timing)) {
+ //mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster " + toString(ndist) + " distances"); mothurOutEndLine();
+ }
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "HClusterCommand", "execute");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+void HClusterCommand::printData(string label){
+ try {
+ if (isTrue(timing)) {
+ mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins())
+ + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine();
+ }
+ print_start = true;
+ loops = 0;
+ start = time(NULL);
+
+ oldRAbund.setLabel(label);
+ if (isTrue(showabund)) {
+ oldRAbund.getSAbundVector().print(cout);
+ }
+ oldRAbund.print(rabundFile);
+ oldRAbund.getSAbundVector().print(sabundFile);
+
+ oldList.setLabel(label);
+ oldList.print(listFile);
+ }
+ catch(exception& e) {
+ errorOut(e, "HClusterCommand", "printData");
+ exit(1);
+ }
+
+
+}
+//**********************************************************************************************************************
+
+vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
+ try {
+ string firstName, secondName;
+ float distance, prevDistance;
+ vector<seqDist> sameSeqs;
+ prevDistance = -1;
+
+ //if you are not at the beginning of the file
+ if (exitedBreak) {
+ sameSeqs.push_back(next);
+ prevDistance = next.dist;
+ exitedBreak = false;
+ }
+
+ //get entry
+ while (filehandle) {
+
+ filehandle >> firstName >> secondName >> distance;
+//cout << firstName << '\t' << secondName << '\t' << distance << endl;
+ gobble(filehandle);
+
+ //save first one
+ if (prevDistance == -1) { prevDistance = distance; }
+
+ map<string,int>::iterator itA = globaldata->nameMap->find(firstName);
+ map<string,int>::iterator itB = globaldata->nameMap->find(secondName);
+
+ if(itA == globaldata->nameMap->end()){
+ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";
+ }
+ if(itB == globaldata->nameMap->end()){
+ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n";
+ }
+
+ //using cutoff
+ if (distance > cutoff) { break; }
+
+ if (distance != -1) { //-1 means skip me
+
+ //are the distances the same
+ if (distance == prevDistance) { //save in vector
+ seqDist temp;
+ temp.seq1 = itA->second;
+ temp.seq2 = itB->second;
+ temp.dist = distance;
+ sameSeqs.push_back(temp);
+ exitedBreak = false;
+ //what about precision??
+
+ }else{
+ next.seq1 = itA->second;
+ next.seq2 = itB->second;
+ next.dist = distance;
+ exitedBreak = true;
+ break;
+ }
+
+ }
+ }
+
+ return sameSeqs;
+ }
+ catch(exception& e) {
+ errorOut(e, "HClusterCommand", "getSeqs");
+ exit(1);
+ }
+
+
+}
+
+//**********************************************************************************************************************
+
--- /dev/null
+#ifndef HCLUSTERCOMMAND_H
+#define HCLUSTERCOMMAND_H
+
+/*
+ * hclustercommand.h
+ * Mothur
+ *
+ * Created by westcott on 10/13/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "globaldata.hpp"
+#include "hcluster.h"
+#include "rabundvector.hpp"
+#include "sabundvector.hpp"
+#include "listvector.hpp"
+#include "readcluster.h"
+
+/******************************************************************/
+//This command is an implementation of the HCluster algorythmn described in
+//ESPRIT: estimating species richness using large collections of 16S rRNA pyrosequences by
+//Yijun Sun1,2,*, Yunpeng Cai2, Li Liu1, Fahong Yu1, Michael L. Farrell3, William McKendree3
+//and William Farmerie1 1
+
+//Interdisciplinary Center for Biotechnology Research, 2Department of Electrical and Computer Engineering,
+//University of Florida, Gainesville, FL 32610-3622 and 3Materials Technology Directorate, Air Force Technical
+//Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA
+//Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009
+
+/******************************************************************/
+
+
+/************************************************************/
+struct seqDist {
+ int seq1;
+ int seq2;
+ float dist;
+};
+/************************************************************/
+class HClusterCommand : public Command {
+
+public:
+ HClusterCommand(string);
+ ~HClusterCommand();
+ int execute();
+ void help();
+
+private:
+ GlobalData* globaldata;
+ HCluster* cluster;
+ ListVector* list;
+ RAbundVector* rabund;
+ RAbundVector oldRAbund;
+ ListVector oldList;
+ ReadCluster* read;
+
+ bool abort;
+
+ string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort;
+ double cutoff;
+ string showabund, timing;
+ int precision, length;
+ ofstream sabundFile, rabundFile, listFile;
+ //ifstream in;
+ seqDist next;
+ bool exitedBreak, sorted;
+
+ bool print_start;
+ time_t start;
+ unsigned long loops;
+
+ void printData(string label);
+ vector<seqDist> getSeqs(ifstream&);
+};
+
+/************************************************************/
+
+#endif
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
mothurOut(lookup[0]->getLabel()); mothurOutEndLine();
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
lastLabel = lookup[0]->getLabel();
}
if ((anyLabelsToProcess(rabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
-
+ string saveLabel = rabund->getLabel();
+
delete rabund;
rabund = input->getRAbundVector(lastLabel);
mothurOut(rabund->getLabel()); mothurOutEndLine();
processedLabels.insert(rabund->getLabel());
userLabels.erase(rabund->getLabel());
+
+ //restore real lastlabel to save below
+ rabund->setLabel(saveLabel);
}
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
//prevent memory leak
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
-
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
lastLabel = lookup[0]->getLabel();
exit(1);
}
}
+//**********************************************************************************************************************
+void NameAssignment::push_back(string name) {
+ try{
+
+ int num = (*this).size();
+ (*this)[name] = num;
+
+ list.push_back(name);
+ }
+ catch(exception& e) {
+ errorOut(e, "NameAssignment", "push_back");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
ListVector getListVector();
int get(string);
void print();
+ void push_back(string);
private:
ifstream fileHandle;
ListVector list;
}
if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
+
delete order;
order = (input->getOrderVector(lastLabel));
mothurOut(order->getLabel()); mothurOutEndLine();
processedLabels.insert(order->getLabel());
userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
}
lastLabel = order->getLabel();
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
--- /dev/null
+/*
+ * readcluster.cpp
+ * Mothur
+ *
+ * Created by westcott on 10/28/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "readcluster.h"
+
+/***********************************************************************/
+
+ReadCluster::ReadCluster(string distfile, float c){
+ distFile = distfile;
+ cutoff = c;
+}
+
+/***********************************************************************/
+
+void ReadCluster::read(NameAssignment* nameMap){
+ try {
+
+ if (format == "phylip") { convertPhylip2Column(nameMap); }
+ else { list = new ListVector(nameMap->getListVector()); }
+
+ createHClusterFile();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadCluster", "read");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+void ReadCluster::createHClusterFile(){
+ try {
+ string outfile = getRootName(distFile) + "sorted.dist";
+
+ //if you can, use the unix sort since its been optimized for years
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ string command = "sort -n -k +3 " + distFile + " -o " + outfile;
+ system(command.c_str());
+ #else //you are stuck with my best attempt...
+ //windows sort does not have a way to specify a column, only a character in the line
+ //since we cannot assume that the distance will always be at the the same character location on each line
+ //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
+
+ //read in file line by file and put distance first
+ string tempDistFile = distFile + ".temp";
+ ifstream input;
+ ofstream output;
+ openInputFile(distFile, input);
+ openOutputFile(tempDistFile, output);
+
+ string firstName, secondName;
+ float dist;
+ while (input) {
+ input >> firstName >> secondName >> dist;
+ output << dist << '\t' << firstName << '\t' << secondName << endl;
+ gobble(input);
+ }
+ input.close();
+ output.close();
+
+
+ //sort using windows sort
+ string tempOutfile = outfile + ".temp";
+ string command = "sort " + tempDistFile + " /O " + tempOutfile;
+ system(command.c_str());
+
+ //read in sorted file and put distance at end again
+ ifstream input2;
+ openInputFile(tempOutfile, input2);
+ openOutputFile(outfile, output);
+
+ while (input2) {
+ input2 >> dist >> firstName >> secondName;
+ output << firstName << '\t' << secondName << '\t' << dist << endl;
+ gobble(input2);
+ }
+ input2.close();
+ output.close();
+
+ //remove temp files
+ remove(tempDistFile.c_str());
+ remove(tempOutfile.c_str());
+ #endif
+
+ OutPutFile = outfile;
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadCluster", "createHClusterFile");
+ exit(1);
+ }
+}
+
+
+/***********************************************************************/
+
+void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
+ try {
+ //convert phylip file to column file
+ map<int, string> rowToName;
+ map<int, string>::iterator it;
+
+ ifstream in;
+ ofstream out;
+ string tempFile = distFile + ".column.temp";
+
+ openInputFile(distFile, in);
+ openOutputFile(tempFile, out);
+
+ float distance;
+ int square, nseqs;
+ string name;
+ vector<string> matrixNames;
+
+ in >> nseqs >> name;
+ rowToName[0] = name;
+ matrixNames.push_back(name);
+
+ if(nameMap == NULL){
+ list = new ListVector(nseqs);
+ list->set(0, name);
+ }
+ else{
+ list = new ListVector(nameMap->getListVector());
+ if(nameMap->count(name)==0){ mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
+ }
+
+ char d;
+ while((d=in.get()) != EOF){
+
+ if(isalnum(d)){
+ square = 1;
+ in.putback(d);
+ for(int i=0;i<nseqs;i++){
+ in >> distance;
+ }
+ break;
+ }
+ if(d == '\n'){
+ square = 0;
+ break;
+ }
+ }
+
+ if(square == 0){
+
+ for(int i=1;i<nseqs;i++){
+ in >> name;
+ rowToName[i] = name;
+ matrixNames.push_back(name);
+
+ //there's A LOT of repeated code throughout this method...
+ if(nameMap == NULL){
+ list->set(i, name);
+
+ for(int j=0;j<i;j++){
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+
+ }
+ else{
+ if(nameMap->count(name)==0){ mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
+
+ for(int j=0;j<i;j++){
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+ }
+ }
+ }
+ else{
+
+ for(int i=1;i<nseqs;i++){
+ in >> name;
+ rowToName[i] = name;
+ matrixNames.push_back(name);
+
+ if(nameMap == NULL){
+ list->set(i, name);
+ for(int j=0;j<nseqs;j++){
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff && j < i){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+ }
+
+ }
+ else{
+ if(nameMap->count(name)==0){ mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
+
+ for(int j=0;j<nseqs;j++){
+ in >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff && j < i){
+ out << i << '\t' << j << '\t' << distance << endl;
+ }
+
+ }
+ }
+ }
+ }
+
+ list->setLabel("0");
+ in.close();
+ out.close();
+
+ if(nameMap == NULL){
+ for(int i=0;i<matrixNames.size();i++){
+ nameMap->push_back(matrixNames[i]);
+ }
+ }
+
+ ifstream in2;
+ ofstream out2;
+
+ string outputFile = getRootName(distFile) + "column.dist";
+ openInputFile(tempFile, in2);
+ openOutputFile(outputFile, out2);
+
+ int first, second;
+ float dist;
+
+ while (in2) {
+ in2 >> first >> second >> dist;
+ out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
+ gobble(in2);
+ }
+ in2.close();
+ out2.close();
+
+ remove(tempFile.c_str());
+ distFile = outputFile;
+ }
+ catch(exception& e) {
+ errorOut(e, "ReadCluster", "convertPhylip2Column");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+ReadCluster::~ReadCluster(){}
+/***********************************************************************/
+
--- /dev/null
+#ifndef READCLUSTER_H
+#define READCLUSTER_H
+/*
+ * readcluster.h
+ * Mothur
+ *
+ * Created by westcott on 10/28/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "mothur.h"
+#include "nameassignment.hpp"
+#include "listvector.hpp"
+
+
+/******************************************************/
+
+class ReadCluster {
+
+public:
+ ReadCluster(string, float);
+ ~ReadCluster();
+ void read(NameAssignment*);
+ string getOutputFile() { return OutPutFile; }
+ void setFormat(string f) { format = f; }
+ ListVector* getListVector() { return list; }
+
+private:
+ string distFile;
+ string OutPutFile, format;
+ ListVector* list;
+ float cutoff;
+
+ void createHClusterFile();
+ void convertPhylip2Column(NameAssignment*);
+};
+
+/******************************************************/
+
+#endif
+
-/*
- * readcolumn.cpp
- * Mothur
- *
- * Created by Sarah Westcott on 4/21/09.
- * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
- *
- */
-
-#include "readcolumn.h"
-#include "progress.hpp"
-
-/***********************************************************************/
-
-ReadColumnMatrix::ReadColumnMatrix(string df) : distFile(df){
-
- successOpen = openInputFile(distFile, fileHandle);
-
-}
-
-/***********************************************************************/
-
-void ReadColumnMatrix::read(NameAssignment* nameMap){
- try {
-
- string firstName, secondName;
- float distance;
- int nseqs = nameMap->size();
-
- list = new ListVector(nameMap->getListVector());
-
- Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs);
-
- int lt = 1;
- int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose
- int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix
-
- //need to see if this is a square or a triangular matrix...
-
- while(fileHandle && lt == 1){ //let's assume it's a triangular matrix...
-
- fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance
-
- map<string,int>::iterator itA = nameMap->find(firstName);
- map<string,int>::iterator itB = nameMap->find(secondName);
-
- if(itA == nameMap->end()){
- cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";
- }
- if(itB == nameMap->end()){
- cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n";
- }
-
- if (distance == -1) { distance = 1000000; }
-
- if(distance < cutoff && itA != itB){
- if(itA->second > itB->second){
- PCell value(itA->second, itB->second, distance);
-
- if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...
- refRow = itA->second;
- refCol = itB->second;
- D->addCell(value);
- }
- else if(refRow == itA->second && refCol == itB->second){
- lt = 0;
- }
- else{
- D->addCell(value);
- }
- }
- else if(itA->second < itB->second){
- PCell value(itB->second, itA->second, distance);
-
- if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...
- refRow = itA->second;
- refCol = itB->second;
- D->addCell(value);
- }
- else if(refRow == itB->second && refCol == itA->second){
- lt = 0;
- }
- else{
- D->addCell(value);
- }
- }
- reading->update(itA->second * nseqs);
- }
- gobble(fileHandle);
- }
-
- if(lt == 0){ // oops, it was square
- fileHandle.close(); //let's start over
- D->clear(); //let's start over
-
- openInputFile(distFile, fileHandle); //let's start over
-
- while(fileHandle){
- fileHandle >> firstName >> secondName >> distance;
-
- map<string,int>::iterator itA = nameMap->find(firstName);
- map<string,int>::iterator itB = nameMap->find(secondName);
-
- if(itA == nameMap->end()){
- cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n";
- }
- if(itB == nameMap->end()){
- cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n";
- }
-
- if (distance == -1) { distance = 1000000; }
-
- if(distance < cutoff && itA->second > itB->second){
- PCell value(itA->second, itB->second, distance);
- D->addCell(value);
- reading->update(itA->second * nseqs);
- }
-
- gobble(fileHandle);
- }
- }
-
- reading->finish();
- fileHandle.close();
-
- list->setLabel("0");
-
- }
- catch(exception& e) {
- errorOut(e, "ReadColumnMatrix", "read");
- exit(1);
- }
-}
-
-/***********************************************************************/
-
-ReadColumnMatrix::~ReadColumnMatrix(){
- //delete D;
- //delete list;
-}
-
-
+/*\r
+ * readcolumn.cpp\r
+ * Mothur\r
+ *\r
+ * Created by Sarah Westcott on 4/21/09.\r
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.\r
+ *\r
+ */\r
+\r
+#include "readcolumn.h"\r
+#include "progress.hpp"\r
+\r
+/***********************************************************************/\r
+\r
+ReadColumnMatrix::ReadColumnMatrix(string df) : distFile(df){\r
+ \r
+ successOpen = openInputFile(distFile, fileHandle);\r
+ \r
+}\r
+\r
+/***********************************************************************/\r
+\r
+void ReadColumnMatrix::read(NameAssignment* nameMap){\r
+ try { \r
+\r
+ string firstName, secondName;\r
+ float distance;\r
+ int nseqs = nameMap->size();\r
+\r
+ list = new ListVector(nameMap->getListVector());\r
+ \r
+ Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs);\r
+\r
+ int lt = 1;\r
+ int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose\r
+ int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix\r
+\r
+ //need to see if this is a square or a triangular matrix...\r
+ \r
+ while(fileHandle && lt == 1){ //let's assume it's a triangular matrix...\r
+ \r
+ fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance\r
+ \r
+ map<string,int>::iterator itA = nameMap->find(firstName);\r
+ map<string,int>::iterator itB = nameMap->find(secondName);\r
+ \r
+ if(itA == nameMap->end()){\r
+ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";\r
+ }\r
+ if(itB == nameMap->end()){\r
+ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n";\r
+ }\r
+\r
+ if (distance == -1) { distance = 1000000; }\r
+ \r
+ if(distance < cutoff && itA != itB){\r
+ if(itA->second > itB->second){\r
+ PCell value(itA->second, itB->second, distance);\r
+ \r
+ if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...\r
+ refRow = itA->second;\r
+ refCol = itB->second;\r
+ D->addCell(value);\r
+ }\r
+ else if(refRow == itA->second && refCol == itB->second){\r
+ lt = 0;\r
+ }\r
+ else{\r
+ D->addCell(value);\r
+ }\r
+ }\r
+ else if(itA->second < itB->second){\r
+ PCell value(itB->second, itA->second, distance);\r
+ \r
+ if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...\r
+ refRow = itA->second;\r
+ refCol = itB->second;\r
+ D->addCell(value);\r
+ }\r
+ else if(refRow == itB->second && refCol == itA->second){\r
+ lt = 0;\r
+ }\r
+ else{\r
+ D->addCell(value);\r
+ }\r
+ }\r
+ reading->update(itA->second * nseqs);\r
+ }\r
+ gobble(fileHandle);\r
+ }\r
+\r
+ if(lt == 0){ // oops, it was square\r
+ fileHandle.close(); //let's start over\r
+ D->clear(); //let's start over\r
+ \r
+ openInputFile(distFile, fileHandle); //let's start over\r
+\r
+ while(fileHandle){\r
+ fileHandle >> firstName >> secondName >> distance;\r
+ \r
+ map<string,int>::iterator itA = nameMap->find(firstName);\r
+ map<string,int>::iterator itB = nameMap->find(secondName);\r
+ \r
+ if(itA == nameMap->end()){\r
+ cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n";\r
+ }\r
+ if(itB == nameMap->end()){\r
+ cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n";\r
+ }\r
+ \r
+ if (distance == -1) { distance = 1000000; }\r
+ \r
+ if(distance < cutoff && itA->second > itB->second){\r
+ PCell value(itA->second, itB->second, distance);\r
+ D->addCell(value);\r
+ reading->update(itA->second * nseqs);\r
+ }\r
+ \r
+ gobble(fileHandle);\r
+ }\r
+ }\r
+\r
+ reading->finish();\r
+ fileHandle.close();\r
+\r
+ list->setLabel("0");\r
+\r
+ }\r
+ catch(exception& e) {\r
+ errorOut(e, "ReadColumnMatrix", "read");\r
+ exit(1);\r
+ }\r
+}\r
+\r
+/***********************************************************************/\r
+\r
+ReadColumnMatrix::~ReadColumnMatrix(){\r
+ //delete D;\r
+ //delete list;\r
+}\r
+\r
+\r
private:
ifstream fileHandle;
string distFile;
+
};
/******************************************************/
ReadDistCommand::~ReadDistCommand(){
if (abort == false) {
- if (format != "matrix") { delete read; delete nameMap; }
+ if (format != "matrix") {
+ delete read;
+ delete nameMap;
+ }
}
}
if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; }
globaldata->gSparseMatrix = read->getMatrix();
numDists = globaldata->gSparseMatrix->getNNodes();
-
+
int lines = cutoff / (1.0/precision);
vector<float> dist_cutoff(lines+1,0);
for (int i = 0; i <= lines;i++) {
goodNameOut.close();
badNameOut.close();
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
+ mothurOutEndLine();
+ }
+ }
+
if(groupfile != ""){
ifstream inputGroups;
inputGroups.close();
goodGroupOut.close();
badGroupOut.close();
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqGroups.size() != 0) {
+ for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
+ mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
+ mothurOutEndLine();
+ }
+ }
}
}
}
gobble(inputGroups);
}
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
+ mothurOutEndLine();
+ }
+ }
+
inputGroups.close();
goodGroupOut.close();
badGroupOut.close();
}
gobble(inputAlignReport);
}
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ mothurOut("Your file does not include the sequence " + *it + " please correct.");
+ mothurOutEndLine();
+ }
+ }
+
inputAlignReport.close();
goodAlignReportOut.close();
badAlignReportOut.close();
//lookup.clear();
string errorOff = "no error";
+ //errorOff = "";
//read in listfile
read = new ReadOTUFile(globaldata->inputFileName);
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
set<string> processedLabels;
- set<string> userLabels = globaldata->labels;
-
+ set<string> userLabels = globaldata->labels;
while((SharedList != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0))) {
}
if ((anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = SharedList->getLabel();
+
delete SharedList;
SharedList = input->getSharedListVector(lastLabel); //get new list vector to process
processedLabels.insert(SharedList->getLabel());
userLabels.erase(SharedList->getLabel());
+
+ //restore real lastlabel to save below
+ SharedList->setLabel(saveLabel);
}
//initialize bin values
for (int i = 0; i < thislookup.size(); i++) {
+//cout << "in printData " << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << endl;
out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
thislookup[i]->print(out);
float snpRateLeft = numSNPSLeft / (float) winSizeLeft;
float snpRateRight = numSNPSRight / (float) winSizeRight;
- float logR = log(snpRateLeft / snpRateRight) / log(2);
+ float logR = log(snpRateLeft / snpRateRight) / log(2.0);
// do not accept excess snp ratio on either side of the break
if (abs(logR) < 1 ) {
}
}
-// random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector
+ random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector
for(int i=0;i<mins.size();i++){
mins[i]->vectorMap = &mins[i]; //assign vectorMap to the address for the container
}
if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = sabund->getLabel();
+
delete sabund;
sabund = input->getSAbundVector(lastLabel);
sumCalculators[i]->print(outputFileHandle);
}
outputFileHandle << endl;
+
+ //restore real lastlabel to save below
+ sabund->setLabel(saveLabel);
}
lastLabel = sabund->getLabel();
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
-
-
lastLabel = lookup[0]->getLabel();
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
processedLabels.insert(lookup[0]->getLabel());
userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
lastLabel = lookup[0]->getLabel();
}
if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
lookup = input->getSharedRAbundVectors(lastLabel);
for (int i = lookup.size(); i > 4; i--) { lookup.pop_back(); } //no memmory leak because pop_back calls destructor
}
venn->getPic(lookup, vennCalculators);
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
}
}
if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = sabund->getLabel();
+
delete sabund;
sabund = input->getSAbundVector(lastLabel);
processedLabels.insert(sabund->getLabel());
userLabels.erase(sabund->getLabel());
+
+ //restore real lastlabel to save below
+ sabund->setLabel(saveLabel);
}
lastLabel = sabund->getLabel();