+void HCluster::combineFile() {
+ try {
+ int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware
+ char* inputBuffer;
+ inputBuffer = new char[bufferSize];
+ size_t numRead;
+
+ string tempDistFile = distfile + ".temp";
+ ofstream out;
+ openOutputFile(tempDistFile, out);
+
+ FILE* in;
+ in = fopen(distfile.c_str(), "rb");
+
+ int first, second;
+ float dist;
+
+ vector< map<int, float> > smallRowColValues;
+ smallRowColValues.resize(2); //0 = row, 1 = col
+ int count = 0;
+
+ //go through file pulling out distances related to rows merging
+ //if mergedMin contains distances add those back into file
+ bool done = false;
+ partialDist = "";
+ while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+//cout << "number of char read = " << numRead << endl;
+//cout << inputBuffer << endl;
+ if (numRead < bufferSize) { done = true; }
+
+ //parse input into individual distances
+ int spot = 0;
+ string outputString = "";
+ while(spot < numRead) {
+ //cout << "spot = " << spot << endl;
+ seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+
+ //you read a partial distance
+ if (nextDist.seq1 == -1) { break; }
+
+ first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+ //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
+ //since file is sorted and mergedMin is sorted
+ //you can put the smallest distance from each through the code below and keep the file sorted
+
+ //while there are still values in mergedMin that are smaller than the distance read from file
+ while (count < mergedMin.size()) {
+
+ //is the distance in mergedMin smaller than from the file
+ if (mergedMin[count].dist < dist) {
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (mergedMin[count].seq1 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq1 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else { //if no, write to temp file
+ outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+ }
+ count++;
+ }else{ break; }
+ }
+
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (first == smallCol) {
+ smallRowColValues[1][second] = dist;
+ }else if (second == smallCol) {
+ smallRowColValues[1][first] = dist;
+ }else if (first == smallRow) {
+ smallRowColValues[0][second] = dist;
+ }else if (second == smallRow) {
+ smallRowColValues[0][first] = dist;
+
+ }else { //if no, write to temp file
+ outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+ }
+ }
+
+ out << outputString;
+ if(done) { break; }
+ }
+ fclose(in);
+
+ //if values in mergedMin are larger than the the largest in file then
+ while (count < mergedMin.size()) {
+ //is this a distance related to the columns merging?
+ //if yes, save in memory
+ if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+ }else if (mergedMin[count].seq1 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallCol) {
+ smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq1 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+ }else if (mergedMin[count].seq2 == smallRow) {
+ smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+
+ }else { //if no, write to temp file
+ out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+ }
+ count++;
+ }
+ out.close();
+ mergedMin.clear();
+
+ //rename tempfile to distfile
+ remove(distfile.c_str());
+ rename(tempDistFile.c_str(), distfile.c_str());
+
+ //merge clustered rows averaging the distances
+ map<int, float>::iterator itMerge;
+ map<int, float>::iterator it2Merge;
+ for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
+ //does smallRowColValues[1] have a distance to this seq too?
+ it2Merge = smallRowColValues[1].find(itMerge->first);
+
+ float average;
+ if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
+ //weighted average
+ int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+ average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+ smallRowColValues[1].erase(it2Merge);
+
+ seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
+ mergedMin.push_back(temp);
+ }
+ }
+
+ //sort merged values
+ sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "combineFile");
+ exit(1);
+ }
+}
+/***********************************************************************/
+seqDist HCluster::getNextDist(char* buffer, int& index, int size){
+ try {
+ seqDist next;
+ int indexBefore = index;
+ string first, second, distance;
+ first = ""; second = ""; distance = "";
+ int tabCount = 0;
+//cout << "partial = " << partialDist << endl;
+ if (partialDist != "") { //read what you can, you know it is less than a whole distance.
+ for (int i = 0; i < partialDist.size(); i++) {
+ if (tabCount == 0) {
+ if (partialDist[i] == '\t') { tabCount++; }
+ else { first += partialDist[i]; }
+ }else if (tabCount == 1) {
+ if (partialDist[i] == '\t') { tabCount++; }
+ else { second += partialDist[i]; }
+ }else if (tabCount == 2) {
+ distance += partialDist[i];
+ }
+ }
+ partialDist = "";
+ }
+
+ //try to get another distance
+ bool gotDist = false;
+ while (index < size) {
+ if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
+ gotDist = true;
+
+ //gobble space
+ while (index < size) {
+ if (isspace(buffer[index])) { index++; }
+ else { break; }
+ }
+ break;
+ }else{
+ if (tabCount == 0) {
+ if (buffer[index] == '\t') { tabCount++; }
+ else { first += buffer[index]; }
+ }else if (tabCount == 1) {
+ if (buffer[index] == '\t') { tabCount++; }
+ else { second += buffer[index]; }
+ }else if (tabCount == 2) {
+ distance += buffer[index];
+ }
+ index++;
+ }
+ }
+
+ //there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0."
+ //then you want to save the partial distance.
+ if (!gotDist) {
+ for (int i = indexBefore; i < size; i++) {
+ partialDist += buffer[i];
+ }
+ index = size + 1;
+ next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
+ }else{
+ int firstname, secondname;
+ float dist;
+
+ convert(first, firstname);
+ convert(second, secondname);
+ convert(distance, dist);
+
+ next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
+ }
+
+ return next;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "getNextDist");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void HCluster::processFile() {
+ try {
+ string firstName, secondName;
+ float distance;
+
+ ifstream in;
+ openInputFile(distfile, in);
+
+ ofstream out;
+ string outTemp = distfile + ".temp";
+ openOutputFile(outTemp, out);
+
+ //get entry
+ while (!in.eof()) {
+
+ in >> firstName >> secondName >> distance; gobble(in);
+
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ //using cutoff
+ if (distance > cutoff) { break; }
+
+ if (distance != -1) { //-1 means skip me
+ out << itA->second << '\t' << itB->second << '\t' << distance << endl;
+ }
+ }
+
+ in.close();
+ out.close();
+
+ remove(distfile.c_str());
+ rename(outTemp.c_str(), distfile.c_str());
+
+ firstRead = false;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "processFile");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+
+
+
+