#include "sparsematrix.hpp"
/***********************************************************************/
-HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
try {
+ m = MothurOut::getInstance();
mapWanted = false;
exitedBreak = false;
numSeqs = list->getNumSeqs();
clusterArray.push_back(temp);
}
- if (method != "average") {
- openInputFile(distfile, filehandle);
+ if ((method == "furthest") || (method == "nearest")) {
+ m->openInputFile(distfile, filehandle);
}else{
processFile();
}
}
catch(exception& e) {
- errorOut(e, "HCluster", "HCluster");
+ m->errorOut(e, "HCluster", "HCluster");
exit(1);
}
}
//cout << '\t' << rabund->get(clusterArray[smallRow].smallChild) << '\t' << rabund->get(clusterArray[smallCol].smallChild) << endl;
}
catch(exception& e) {
- errorOut(e, "HCluster", "clusterBins");
+ m->errorOut(e, "HCluster", "clusterBins");
exit(1);
}
}
catch(exception& e) {
- errorOut(e, "HCluster", "clusterNames");
+ m->errorOut(e, "HCluster", "clusterNames");
exit(1);
}
return node;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getUpmostParent");
+ m->errorOut(e, "HCluster", "getUpmostParent");
exit(1);
}
}
}
catch(exception& e) {
- errorOut(e, "HCluster", "getUpmostParent");
+ m->errorOut(e, "HCluster", "getUpmostParent");
exit(1);
}
}
return linkValue;
}
catch(exception& e) {
- errorOut(e, "HCluster", "makeActive");
+ m->errorOut(e, "HCluster", "makeActive");
exit(1);
}
}
}
}
catch(exception& e) {
- errorOut(e, "HCluster", "updateArrayandLinkTable");
+ m->errorOut(e, "HCluster", "updateArrayandLinkTable");
exit(1);
}
}
/***********************************************************************/
-bool HCluster::update(int row, int col, float distance){
+double HCluster::update(int row, int col, float distance){
try {
bool cluster = false;
smallRow = row;
//you don't want to cluster with yourself
if (smallRow != smallCol) {
- if (method != "average") {
+ if ((method == "furthest") || (method == "nearest")) {
//can we cluster???
if (method == "nearest") { cluster = true; }
else{ //assume furthest
}
}
- return cluster;
+ return cutoff;
//printInfo();
}
catch(exception& e) {
- errorOut(e, "HCluster", "update");
+ m->errorOut(e, "HCluster", "update");
exit(1);
}
}
/***********************************************************************/
-void HCluster::setMapWanted(bool m) {
+void HCluster::setMapWanted(bool ms) {
try {
- mapWanted = m;
+ mapWanted = ms;
//initialize map
for (int i = 0; i < list->getNumBins(); i++) {
}
catch(exception& e) {
- errorOut(e, "HCluster", "setMapWanted");
+ m->errorOut(e, "HCluster", "setMapWanted");
exit(1);
}
}
seq2Bin[names] = clusterArray[smallCol].smallChild;
}
catch(exception& e) {
- errorOut(e, "HCluster", "updateMap");
+ m->errorOut(e, "HCluster", "updateMap");
exit(1);
}
}
try {
vector<seqDist> sameSeqs;
- if(method != "average") {
+ if ((method == "furthest") || (method == "nearest")) {
sameSeqs = getSeqsFNNN();
}else{
sameSeqs = getSeqsAN();
return sameSeqs;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getSeqs");
+ m->errorOut(e, "HCluster", "getSeqs");
exit(1);
}
}
//get entry
while (!filehandle.eof()) {
- filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+ filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
//save first one
if (prevDistance == -1) { prevDistance = distance; }
map<string,int>::iterator itA = nameMap->find(firstName);
map<string,int>::iterator itB = nameMap->find(secondName);
- if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
- if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
+ if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
//using cutoff
if (distance > cutoff) { break; }
return sameSeqs;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getSeqsFNNN");
+ m->errorOut(e, "HCluster", "getSeqsFNNN");
exit(1);
}
}
//**********************************************************************************************************************
-//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
vector<seqDist> HCluster::getSeqsAN(){
try {
int firstName, secondName;
vector<seqDist> sameSeqs;
prevDistance = -1;
- openInputFile(distfile, filehandle, "no error");
+ m->openInputFile(distfile, filehandle, "no error");
//is the smallest value in mergedMin or the distfile?
float mergedMinDist = 10000;
if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
if (!filehandle.eof()) {
- filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+ filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
//save first one
if (prevDistance == -1) { prevDistance = distance; }
if (distance != -1) { //-1 means skip me
seqDist temp(firstName, secondName, distance);
sameSeqs.push_back(temp);
- }
+ }else{ distance = 10000; }
}
if (mergedMinDist < distance) { //get minimum distance from mergedMin
//get entry
while (!filehandle.eof()) {
- filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+ filehandle >> firstName >> secondName >> distance; m->gobble(filehandle);
if (prevDistance == -1) { prevDistance = distance; }
return temp;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getSeqsAN");
+ m->errorOut(e, "HCluster", "getSeqsAN");
exit(1);
}
}
/***********************************************************************/
-void HCluster::combineFile() {
+int HCluster::combineFile() {
try {
//int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
//char* inputBuffer;
string tempDistFile = distfile + ".temp";
ofstream out;
- openOutputFile(tempDistFile, out);
+ m->openOutputFile(tempDistFile, out);
//FILE* in;
//in = fopen(distfile.c_str(), "rb");
ifstream in;
- openInputFile(distfile, in);
+ m->openInputFile(distfile, in, "no error");
int first, second;
float dist;
//since file is sorted and mergedMin is sorted
//you can put the smallest distance from each through the code below and keep the file sorted
- in >> first >> second >> dist; gobble(in);
+ in >> first >> second >> dist; m->gobble(in);
+
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempDistFile); return 0; }
//while there are still values in mergedMin that are smaller than the distance read from file
while (count < mergedMin.size()) {
smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
}else { //if no, write to temp file
//outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
- out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ //if (mergedMin[count].dist < cutoff) {
+ out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ //}
}
count++;
}else{ break; }
}else { //if no, write to temp file
//outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
- out << first << '\t' << second << '\t' << dist << endl;
+ //if (dist < cutoff) {
+ out << first << '\t' << second << '\t' << dist << endl;
+ //}
}
}
smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
}else { //if no, write to temp file
- out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ //if (mergedMin[count].dist < cutoff) {
+ out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ //}
}
count++;
}
mergedMin.clear();
//rename tempfile to distfile
- remove(distfile.c_str());
+ m->mothurRemove(distfile);
rename(tempDistFile.c_str(), distfile.c_str());
-
+//cout << "remove = "<< renameOK << " rename = " << ok << endl;
+
//merge clustered rows averaging the distances
map<int, float>::iterator itMerge;
map<int, float>::iterator it2Merge;
float average;
if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
- //weighted average
- int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
- average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+ //average
+ if (method == "average") {
+ int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+ average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+ }else { //weighted
+ average = ((itMerge->second * 1.0) + (it2Merge->second * 1.0)) / (float) 2.0;
+ }
+
smallRowColValues[1].erase(it2Merge);
seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
mergedMin.push_back(temp);
+ }else {
+ //can't find value so update cutoff
+ if (cutoff > itMerge->second) { cutoff = itMerge->second; }
}
}
-
+
+ //update cutoff
+ for(itMerge = smallRowColValues[1].begin(); itMerge != smallRowColValues[1].end(); itMerge++) {
+ if (cutoff > itMerge->second) { cutoff = itMerge->second; }
+ }
+
//sort merged values
sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
+
+ return 0;
}
catch(exception& e) {
- errorOut(e, "HCluster", "combineFile");
+ m->errorOut(e, "HCluster", "combineFile");
exit(1);
}
}
if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
gotDist = true;
- //gobble space
+ //m->gobble space
while (index < size) {
if (isspace(buffer[index])) { index++; }
else { break; }
return next;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getNextDist");
+ m->errorOut(e, "HCluster", "getNextDist");
exit(1);
}
}
/***********************************************************************/
-void HCluster::processFile() {
+int HCluster::processFile() {
try {
string firstName, secondName;
float distance;
ifstream in;
- openInputFile(distfile, in);
+ m->openInputFile(distfile, in, "no error");
ofstream out;
string outTemp = distfile + ".temp";
- openOutputFile(outTemp, out);
+ m->openOutputFile(outTemp, out);
//get entry
while (!in.eof()) {
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outTemp); return 0; }
- in >> firstName >> secondName >> distance; gobble(in);
+ in >> firstName >> secondName >> distance; m->gobble(in);
map<string,int>::iterator itA = nameMap->find(firstName);
map<string,int>::iterator itB = nameMap->find(secondName);
- if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
- if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
+ if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
//using cutoff
if (distance > cutoff) { break; }
in.close();
out.close();
- remove(distfile.c_str());
+ m->mothurRemove(distfile);
rename(outTemp.c_str(), distfile.c_str());
+
+ return 0;
}
catch(exception& e) {
- errorOut(e, "HCluster", "processFile");
+ m->errorOut(e, "HCluster", "processFile");
exit(1);
}
}