+/***********************************************************************/
+// Stores ms in mapWanted and fills seq2Bin from the current contents of list.
+// Each bin returned by list->get(i) is treated as a comma separated string of
+// sequence names; every name is mapped to the index i of the bin holding it.
+// Note that the map is filled regardless of the value of ms.
+// Any std::exception is reported through m->errorOut and terminates the program.
+void HCluster::setMapWanted(bool ms) {
+    try {
+        mapWanted = ms;
+
+        //initialize map
+        for (int i = 0; i < list->getNumBins(); i++) {
+
+            //parse bin: walk the names in place rather than re-copying the
+            //remainder of the string after every name
+            string names = list->get(i);
+            string::size_type start = 0;
+            string::size_type comma = names.find(',', start);
+            while (comma != string::npos) {
+                //save name and bin number
+                seq2Bin[names.substr(start, comma - start)] = i;
+                start = comma + 1;
+                comma = names.find(',', start);
+            }
+
+            //get last name (the whole string when the bin holds no comma)
+            seq2Bin[names.substr(start)] = i;
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "HCluster", "setMapWanted");
+        exit(1);
+    }
+}
+/***********************************************************************/
+// Re-points, in seq2Bin, every sequence name found in the bin
+// clusterArray[smallRow].smallChild to the bin clusterArray[smallCol].smallChild.
+// The bin is read from list as a comma separated string of names.
+// Any std::exception is reported through m->errorOut and terminates the program.
+void HCluster::updateMap() {
+    try {
+        //update location of seqs in smallRow since they move to smallCol now
+        string names = list->get(clusterArray[smallRow].smallChild);
+
+        //walk the names in place rather than re-copying the remainder of the
+        //string after every name
+        string::size_type start = 0;
+        string::size_type comma = names.find(',', start);
+        while (comma != string::npos) {
+            //save name and bin number
+            seq2Bin[names.substr(start, comma - start)] = clusterArray[smallCol].smallChild;
+            start = comma + 1;
+            comma = names.find(',', start);
+        }
+
+        //get last name (the whole string when the bin holds no comma)
+        seq2Bin[names.substr(start)] = clusterArray[smallCol].smallChild;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "HCluster", "updateMap");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Returns the seqDist entries produced by the reader that matches method:
+// getSeqsAN() when method is "average", getSeqsFNNN() for everything else.
+// Any std::exception is reported through m->errorOut and terminates the program.
+vector<seqDist> HCluster::getSeqs(){
+    try {
+        const bool useAverage = (method == "average");
+
+        vector<seqDist> nextSeqs = useAverage ? getSeqsAN() : getSeqsFNNN();
+        return nextSeqs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "HCluster", "getSeqs");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads records of the form "firstName secondName distance" from filehandle and
+// returns, in shuffled order, the run of records that share the same distance.
+// Names are translated to their nameMap values before being stored.
+//
+// Reading stops when:
+//  - a distance greater than cutoff is read,
+//  - a distance different from the current run is read; that record is saved in
+//    next and exitedBreak is set so the following call starts with it,
+//  - the end of the file is reached, or
+//  - a record cannot be read completely (empty, truncated or malformed input).
+// Records whose distance is -1 are skipped.
+// The program exits if a name is not present in nameMap; any std::exception is
+// reported through m->errorOut and also terminates the program.
+vector<seqDist> HCluster::getSeqsFNNN(){
+    try {
+        string firstName, secondName;
+        float distance = 0;
+        float prevDistance = -1;
+        vector<seqDist> sameSeqs;
+
+        //if you are not at the beginning of the file
+        if (exitedBreak) {
+            sameSeqs.push_back(next);
+            prevDistance = next.dist;
+            exitedBreak = false;
+        }
+
+        //get entry
+        while (!filehandle.eof()) {
+
+            //stop on a failed or partial read: eof() alone does not notice a stream
+            //stuck in a fail state, which would otherwise be re-processed forever
+            //with stale (or never initialized) values
+            if (!(filehandle >> firstName >> secondName >> distance)) { break; }
+            gobble(filehandle);
+
+            //save first one
+            if (prevDistance == -1) { prevDistance = distance; }
+
+            map<string,int>::iterator itA = nameMap->find(firstName);
+            map<string,int>::iterator itB = nameMap->find(secondName);
+            if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+            if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+            //using cutoff
+            if (distance > cutoff) { break; }
+
+            if (distance != -1) { //-1 means skip me
+
+                //are the distances the same
+                if (distance == prevDistance) { //save in vector
+                    seqDist temp(itA->second, itB->second, distance);
+                    sameSeqs.push_back(temp);
+                    exitedBreak = false;
+                }else{
+                    //first record of the next run: remember it for the next call
+                    next.seq1 = itA->second;
+                    next.seq2 = itB->second;
+                    next.dist = distance;
+                    exitedBreak = true;
+                    break;
+                }
+            }
+        }
+
+        //randomize matching dists
+        random_shuffle(sameSeqs.begin(), sameSeqs.end());
+
+        return sameSeqs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "HCluster", "getSeqsFNNN");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
+//
+// Reopens distfile on filehandle and returns at most one seqDist: a randomly
+// chosen entry among those tied at the smallest distance.
+//  - If mergedMin[0].dist is strictly smaller than the first distance in the
+//    file, the candidates are the leading entries of mergedMin sharing that
+//    distance (mergedMin is presumably sorted ascending - confirm against the
+//    code that fills it).
+//  - Otherwise the candidates are the leading run of equal distances in the file.
+// Records whose distance is -1 are skipped. When the file yields no readable
+// record and mergedMin has nothing smaller than the 10000 sentinel, the returned
+// vector is empty.
+// Any std::exception is reported through m->errorOut and terminates the program.
+vector<seqDist> HCluster::getSeqsAN(){
+    try {
+        int firstName = 0, secondName = 0;
+        float prevDistance = -1;
+        vector<seqDist> sameSeqs;
+
+        openInputFile(distfile, filehandle, "no error");
+
+        //is the smallest value in mergedMin or the distfile?
+        float mergedMinDist = 10000;
+        float distance = 10000;
+        if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
+
+        if (!filehandle.eof()) {
+            if (filehandle >> firstName >> secondName >> distance) {
+                gobble(filehandle);
+                //save first one
+                prevDistance = distance;
+                if (distance != -1) { //-1 means skip me
+                    seqDist temp(firstName, secondName, distance);
+                    sameSeqs.push_back(temp);
+                }else{ distance = 10000; }
+            }else{
+                //nothing usable could be read (empty, unreadable or malformed file):
+                //restore the sentinel so that only mergedMin can supply the minimum
+                //and no entry built from unread values is stored
+                distance = 10000;
+            }
+        }
+
+        if (mergedMinDist < distance) { //get minimum distance from mergedMin
+            //remove distance we saved from file
+            sameSeqs.clear();
+            prevDistance = mergedMinDist;
+
+            for (int i = 0; i < mergedMin.size(); i++) {
+                if (mergedMin[i].dist == prevDistance) {
+                    sameSeqs.push_back(mergedMin[i]);
+                }else { break; }
+            }
+        }else{ //get minimum from file
+            //get entry
+            while (!filehandle.eof()) {
+
+                //stop on a failed or partial read: eof() alone does not notice a
+                //stream stuck in a fail state, which would otherwise loop forever
+                if (!(filehandle >> firstName >> secondName >> distance)) { break; }
+                gobble(filehandle);
+
+                if (prevDistance == -1) { prevDistance = distance; }
+
+                if (distance != -1) { //-1 means skip me
+                    //are the distances the same
+                    if (distance == prevDistance) { //save in vector
+                        seqDist temp(firstName, secondName, distance);
+                        sameSeqs.push_back(temp);
+                    }else{
+                        break;
+                    }
+                }
+            }
+        }
+        filehandle.close();
+
+        //randomize matching dists
+        random_shuffle(sameSeqs.begin(), sameSeqs.end());
+
+        //can only return one value since once these are merged the other distances in sameSeqs may have changed
+        vector<seqDist> temp;
+        if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); }
+
+        return temp;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "HCluster", "getSeqsAN");
+        exit(1);
+    }
+}
+
+/***********************************************************************/
+int HCluster::combineFile() {
+ try {
+ //int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
+ //char* inputBuffer;
+ //inputBuffer = new char[bufferSize];
+ //size_t numRead;
+
+ string tempDistFile = distfile + ".temp";
+ ofstream out;
+ openOutputFile(tempDistFile, out);
+
+ //FILE* in;
+ //in = fopen(distfile.c_str(), "rb");
+
+ ifstream in;
+ openInputFile(distfile, in);
+
+ int first, second;
+ float dist;
+
+ vector< map<int, float> > smallRowColValues;
+ smallRowColValues.resize(2); //0 = row, 1 = col
+ int count = 0;
+
+ //go through file pulling out distances related to rows merging
+ //if mergedMin contains distances add those back into file
+ //bool done = false;
+ //partialDist = "";
+ //while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+//cout << "number of char read = " << numRead << endl;
+//cout << inputBuffer << endl;
+ //if (numRead < bufferSize) { done = true; }
+
+ //parse input into individual distances
+ //int spot = 0;
+ //string outputString = "";
+ //while(spot < numRead) {
+ //cout << "spot = " << spot << endl;
+ // seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+
+ //you read a partial distance
+ // if (nextDist.seq1 == -1) { break; }
+ while (!in.eof()) {
+ //first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+ //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
+ //since file is sorted and mergedMin is sorted
+ //you can put the smallest distance from each through the code below and keep the file sorted
+
+ in >> first >> second >> dist; gobble(in);
+
+ if (m->control_pressed) { in.close(); out.close(); remove(tempDistFile.c_str()); return 0; }
+
+ //while there are still values in mergedMin that are smaller than the distance read from file
+ while (count < mergedMin.size()) {
+
+ //is the distance in mergedMin smaller than from the file
+ if (mergedMin[count].dist < dist) {
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (mergedMin[count].seq1 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq1 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else { //if no, write to temp file
+ //outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+ out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ }
+ count++;
+ }else{ break; }
+ }
+
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (first == smallCol) {
+ smallRowColValues[1][second] = dist;
+ }else if (second == smallCol) {
+ smallRowColValues[1][first] = dist;
+ }else if (first == smallRow) {
+ smallRowColValues[0][second] = dist;
+ }else if (second == smallRow) {
+ smallRowColValues[0][first] = dist;
+
+ }else { //if no, write to temp file
+ //outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+ out << first << '\t' << second << '\t' << dist << endl;
+ }
+ }
+
+ //out << outputString;
+ //if(done) { break; }
+ //}
+ //fclose(in);
+ in.close();
+
+ //if values in mergedMin are larger than the largest distance in the file, process them here
+ while (count < mergedMin.size()) {
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (mergedMin[count].seq1 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq1 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+
+ }else { //if no, write to temp file
+ out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ }
+ count++;
+ }
+ out.close();
+ mergedMin.clear();
+
+ //rename tempfile to distfile
+ remove(distfile.c_str());
+ rename(tempDistFile.c_str(), distfile.c_str());
+//cout << "remove = "<< renameOK << " rename = " << ok << endl;