#include "splitmatrix.h"
#include "phylotree.h"
+#include "distancecommand.h"
+#include "seqsummarycommand.h"
/***********************************************************************/
-SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
+SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){ // constructor used when a distance file already exists
m = MothurOut::getInstance();
distFile = distfile;
cutoff = c; // compared with distances in the splitDistance* routines and with the taxonomy level in splitClassify()
namefile = name;
method = t; // selects the splitting routine: "distance", "classify" or "fasta"
taxFile = tax;
+ large = l; // true: splitDistance() calls splitDistanceLarge(); false: it calls splitDistanceRAM()
+}
+/***********************************************************************/
+
+// Constructor used when the split starts from a fasta file; the per-group
+// distance files are produced later by createDistanceFilesFromTax().
+//   ffile  - fasta file holding the sequences
+//   name   - names file
+//   tax    - taxonomy file
+//   c      - taxonomy level cutoff used to form the groups
+//   cu     - distance cutoff handed to the distance calculation
+//   t      - splitting method
+//   p      - processor count handed to the distance calculation
+//   output - output directory ("" lets createDistanceFilesFromTax() use the fasta file's path)
+SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
+    m = MothurOut::getInstance();
+    fastafile = ffile;
+    namefile = name;
+    taxFile = tax;
+    cutoff = c; //tax level cutoff
+    distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
+    method = t;
+    processors = p;
+    outputDir = output;
+    large = false; //not supplied to this constructor; give it a defined value so splitDistance() never branches on an indeterminate bool
}
/***********************************************************************/
if (method == "distance") {
splitDistance();
- }else if (method == "classify") {
+ }else if ((method == "classify") || (method == "fasta")) {
splitClassify();
}else {
m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
int SplitMatrix::splitDistance(){
try {
- vector<set<string> > groups;
+        //pick the implementation: the "large" variant flushes its buffers to temp files while reading,
+        //the RAM variant keeps every buffered distance in memory until the whole file has been read
+        if (!large) {
+            splitDistanceRAM();
+        }
+        else {
+            splitDistanceLarge();
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SplitMatrix", "splitDistance");
+        exit(1);
+    }
+}
+
+/***********************************************************************/
+/*
+ * Taxonomy-based split. Reads the taxonomy file into a PhyloTree, then gives
+ * every tree node sitting exactly at level `cutoff` and holding more than one
+ * sequence its own group number. The resulting name->group map is handed to
+ * splitDistanceFileByTax() when method is "classify", and to
+ * createDistanceFilesFromTax() otherwise.
+ * Returns 0; exceptions are reported through m->errorOut() and end the program.
+ */
+int SplitMatrix::splitClassify(){
+    try {
+        cutoff = int(cutoff); //drop any fractional part, the value is used as a tree level below
+
+        //load the user's taxonomy file into a tree
+        PhyloTree* taxTree = new PhyloTree();
+
+        ifstream taxStream;
+        m->openInputFile(taxFile, taxStream);
+
+        string seqName, taxonomy;
+        while (!taxStream.eof()) {
+            taxStream >> seqName >> taxonomy;
+            m->gobble(taxStream);
+            taxTree->addSeqToTree(seqName, taxonomy);
+        }
+        taxStream.close();
+
+        taxTree->assignHeirarchyIDs(0);
+
+        //a cutoff deeper than the tree is pulled back to the deepest level
+        if (cutoff > taxTree->getMaxLevel()) {
+            m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(taxTree->getMaxLevel()));
+            m->mothurOutEndLine();
+            cutoff = taxTree->getMaxLevel();
+        }
+
+        map<string, int> seqGroup;
+        int numGroups = 0;
+
+        //walk every node, keeping only those at the cutoff level
+        for (int i = 0; i < taxTree->getNumNodes(); i++) {
+            TaxNode node = taxTree->get(i);
+
+            if (node.level != cutoff) { continue; }
+            if (!(node.accessions.size() > 1)) { continue; } //a taxon with a single sequence is a singleton, no group
+
+            for (int j = 0; j < node.accessions.size(); j++) {
+                seqGroup[node.accessions[j]] = numGroups;
+            }
+            numGroups++;
+        }
+
+        delete taxTree;
+
+        if (method != "classify") {
+            createDistanceFilesFromTax(seqGroup, numGroups);
+        }
+        else {
+            splitDistanceFileByTax(seqGroup, numGroups);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SplitMatrix", "splitClassify");
+        exit(1);
+    }
+}
+/***********************************************************************/
+/*
+ * Fasta-based split.
+ *  1. Every sequence of the fasta file that has a group in seqGroup is appended
+ *     to <fastafile>.<group>.temp; a grouped name that never shows up in the
+ *     fasta file is reported and ends the program.
+ *  2. A DistanceCommand is run on each temp fasta file, which is then removed.
+ *  3. The names file is split: grouped names go to <namefile>.<group>.temp, all
+ *     others to <namefile>.extra.temp (stored in `singleton`).
+ *  4. For each group whose distance file opens and is not blank, the pair
+ *     (distance file -> names file) is pushed onto `dists`; for a blank distance
+ *     file the group's names are moved to the extra file instead.
+ * seqGroup maps sequence name -> group number, numGroups is the number of groups.
+ * Returns 0; exceptions are reported through m->errorOut() and end the program.
+ */
+int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
+    try {
+        map<string, int> unseen = seqGroup; //entries are erased as their sequences turn up in the fasta file
+        map<string, int>::iterator pos;
+        set<string> names; //NOTE(review): filled when no names file is given, but not read anywhere in this function
+
+        //get rid of temp fasta files left behind by an earlier run
+        for (int g = 0; g < numGroups; g++) {
+            m->mothurRemove((fastafile + "." + toString(g) + ".temp"));
+        }
+
+        ifstream fastaIn;
+        m->openInputFile(fastafile, fastaIn);
+
+        //distribute the sequences over the per-group temp fasta files
+        ofstream outFile;
+        while (!fastaIn.eof()) {
+            Sequence query(fastaIn);
+            m->gobble(fastaIn);
+            if (query.getName() == "") { continue; }
+
+            pos = seqGroup.find(query.getName());
+
+            if (namefile == "") { names.insert(query.getName()); }
+
+            if (pos == seqGroup.end()) { continue; } //singleton, belongs to no group
+
+            m->openOutputFileAppend((fastafile + "." + toString(pos->second) + ".temp"), outFile);
+            query.printSequence(outFile);
+            outFile.close();
+
+            unseen.erase(query.getName());
+        }
+        fastaIn.close();
+
+        //anything still listed was grouped by taxonomy but absent from the fasta file
+        if (!unseen.empty()) {
+            pos = unseen.begin();
+            m->mothurOut("ERROR: " + pos->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
+            exit(1);
+        }
+
+        unseen.clear();
+
+        //compute the distances of each group
+        for (int g = 0; g < numGroups; g++) {
+            string groupFasta = fastafile + "." + toString(g) + ".temp";
+
+            string options = "fasta=" + groupFasta + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
+            if (outputDir != "") { options += ", outputdir=" + outputDir; }
+
+            Command* distCommand = new DistanceCommand(options);
+            distCommand->execute();
+            delete distCommand;
+
+            m->mothurRemove(groupFasta);
+
+            //old names temp files are removed as well, just in case
+            m->mothurRemove((namefile + "." + toString(g) + ".temp"));
+        }
+
+        singleton = namefile + ".extra.temp";
+        ofstream remainingNames;
+        m->openOutputFile(singleton, remainingNames);
+
+        bool wroteExtra = false;
+
+        ifstream bigNameFile;
+        m->openInputFile(namefile, bigNameFile);
+
+        //split the names file the same way the sequences were split
+        string name, nameList;
+        while (!bigNameFile.eof()) {
+            bigNameFile >> name >> nameList;
+            m->gobble(bigNameFile);
+
+            pos = seqGroup.find(name);
+
+            if (pos == seqGroup.end()) { //no group, goes to the extra file
+                wroteExtra = true;
+                remainingNames << name << '\t' << nameList << endl;
+            }
+            else {
+                m->openOutputFileAppend((namefile + "." + toString(pos->second) + ".temp"), outFile);
+                outFile << name << '\t' << nameList << endl;
+                outFile.close();
+            }
+        }
+        bigNameFile.close();
+
+        for (int g = 0; g < numGroups; g++) {
+            string tempNameFile = namefile + "." + toString(g) + ".temp";
+            if (outputDir == "") { outputDir = m->hasPath(fastafile); }
+            string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(g) + ".temp"))) + "dist";
+
+            //only groups with a readable, non-blank distance file are kept
+            ifstream distCheck;
+            distCheck.open(tempDistFile.c_str());
+            if (distCheck) {
+                m->gobble(distCheck);
+                if (distCheck.eof()) { //blank file - this could occur if all dists in group are above cutoff
+                    ifstream groupNames;
+                    m->openInputFile(tempNameFile, groupNames);
+
+                    while (!groupNames.eof()) {
+                        groupNames >> name >> nameList;
+                        m->gobble(groupNames);
+                        wroteExtra = true;
+                        remainingNames << name << '\t' << nameList << endl;
+                    }
+                    groupNames.close();
+                    m->mothurRemove(tempNameFile);
+                }
+                else {
+                    map<string, string> pairing;
+                    pairing[tempDistFile] = tempNameFile;
+                    dists.push_back(pairing);
+                }
+            }
+            distCheck.close();
+        }
+
+        remainingNames.close();
+        if (!wroteExtra) {
+            m->mothurRemove(singleton);
+            singleton = "none";
+        }
+
+        //interrupted by the user: throw away everything that was registered
+        if (m->control_pressed) {
+            for (int i = 0; i < dists.size(); i++) {
+                m->mothurRemove((dists[i].begin()->first));
+                m->mothurRemove((dists[i].begin()->second));
+            }
+            dists.clear();
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
+        exit(1);
+    }
+}
+/***********************************************************************/
+/*
+ * Splits an existing distance file by taxonomy group.
+ * Each record read from distFile (seqA seqB dist) is kept only when both
+ * sequences belong to the same group in seqGroup; kept records are buffered
+ * per group and appended to <distFile>.<group>.temp. The names file is then
+ * split the same way: grouped names go to <namefile>.<group>.temp, all others
+ * to <namefile>.extra.temp (stored in `singleton`). Groups that received at
+ * least one distance are registered in `dists` (distance file -> names file);
+ * the names of groups without distances are moved to the extra file.
+ * seqGroup maps sequence name -> group number, numGroups is the number of groups.
+ * Returns 0, also when the user interrupts the run while the distance file is
+ * being read (the partial per-group distance files are removed first).
+ * Exceptions are reported through m->errorOut() and end the program.
+ */
+int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
+    try {
+        map<string, int>::iterator it;
+        map<string, int>::iterator it2;
+
+        ifstream dFile;
+        m->openInputFile(distFile, dFile);
+        ofstream outFile;
+
+        for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+            m->mothurRemove((distFile + "." + toString(i) + ".temp"));
+        }
//for buffering the io to improve speed
//allow for 10 dists to be stored, then output.
+        //NOTE: the flush below actually happens once a group holds more than 30 buffered distances
+        vector<string> outputs; outputs.resize(numGroups, "");
+        vector<int> numOutputs; numOutputs.resize(numGroups, 0);
+
+        //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
+        //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
+        vector<bool> validDistances; validDistances.resize(numGroups, false);
+
+        //for each distance
+        while(dFile){
+            string seqA, seqB;
+            float dist;
+
+            if (m->control_pressed) {
+                //user interrupt: drop the partial per-group files and stop right here, like splitDistanceLarge/splitDistanceRAM do,
+                //instead of falling through and writing the buffers and names files again
+                dFile.close();
+                for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); }
+                return 0;
+            }
+
+            dFile >> seqA >> seqB >> dist; m->gobble(dFile);
+
+            //if both sequences are in the same group then they are within the cutoff
+            it = seqGroup.find(seqA);
+            it2 = seqGroup.find(seqB);
+
+            if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
+                if (it->second == it2->second) { //they are from the same group so add the distance
+                    if (numOutputs[it->second] > 30) { //buffer is full: write it out together with the current record
+                        m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+                        outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
+                        outFile.close();
+                        outputs[it->second] = "";
+                        numOutputs[it->second] = 0;
+                        validDistances[it->second] = true;
+                    }else{
+                        outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+                        numOutputs[it->second]++;
+                    }
+                }
+            }
+        }
+        dFile.close();
+
+        for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+            m->mothurRemove((namefile + "." + toString(i) + ".temp"));
+
+            //write out any remaining buffers
+            if (numOutputs[i] > 0) {
+                m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+                outFile << outputs[i];
+                outFile.close();
+                outputs[i] = "";
+                numOutputs[i] = 0;
+                validDistances[i] = true;
+            }
+        }
+
+        ifstream bigNameFile;
+        m->openInputFile(namefile, bigNameFile);
+
+        singleton = namefile + ".extra.temp";
+        ofstream remainingNames;
+        m->openOutputFile(singleton, remainingNames);
+
+        bool wroteExtra = false;
+
+        string name, nameList;
+        while(!bigNameFile.eof()){
+            bigNameFile >> name >> nameList; m->gobble(bigNameFile);
+
+            //did this sequence get assigned a group
+            it = seqGroup.find(name);
+
+            if (it != seqGroup.end()) {
+                m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+                outFile << name << '\t' << nameList << endl;
+                outFile.close();
+            }else{
+                wroteExtra = true;
+                remainingNames << name << '\t' << nameList << endl;
+            }
+        }
+        bigNameFile.close();
+
+        for(int i=0;i<numGroups;i++){
+            string tempNameFile = namefile + "." + toString(i) + ".temp";
+            string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+            //if there are valid distances
+            if (validDistances[i]) {
+                map<string, string> temp;
+                temp[tempDistFile] = tempNameFile;
+                dists.push_back(temp);
+            }else{
+                //no distance was written for this group: its names join the extra (singleton) file
+                ifstream in;
+                m->openInputFile(tempNameFile, in);
+
+                while(!in.eof()) {
+                    in >> name >> nameList; m->gobble(in);
+                    wroteExtra = true;
+                    remainingNames << name << '\t' << nameList << endl;
+                }
+                in.close();
+                m->mothurRemove(tempNameFile);
+            }
+        }
+
+        remainingNames.close();
+
+        if (!wroteExtra) {
+            m->mothurRemove(singleton);
+            singleton = "none";
+        }
+
+        //interrupted while the names were being split: throw away everything that was registered
+        if (m->control_pressed) {
+            for (int i = 0; i < dists.size(); i++) {
+                m->mothurRemove((dists[i].begin()->first));
+                m->mothurRemove((dists[i].begin()->second));
+            }
+            dists.clear();
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
+        exit(1);
+    }
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceLarge(){
+ try {
+ vector<set<string> > groups;
+
+ //for buffering the io to improve speed
+ //allow for 30 dists to be stored, then output.
vector<string> outputs;
vector<int> numOutputs;
vector<bool> wroteOutPut;
ofstream outFile;
ifstream dFile;
- openInputFile(distFile, dFile);
+ m->openInputFile(distFile, dFile);
while(dFile){
string seqA, seqB;
dFile >> seqA >> seqB >> dist;
- if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
+ if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
if(dist < cutoff){
//cout << "in cutoff: " << dist << endl;
int groupIDA = -1;
int groupIDB = -1;
int groupID = -1;
- int prevGroupID = -1;
for(int i=0;i<numGroups;i++){
set<string>::iterator aIt = groups[i].find(seqA);
newGroup.insert(seqB);
groups.push_back(newGroup);
- outFile.close();
- string fileName = distFile + "." + toString(numGroups) + ".temp";
- outFile.open(fileName.c_str(), ios::ate);
-
string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
outputs.push_back(tempOut);
numOutputs.push_back(1);
}
else{
string fileName = distFile + "." + toString(groupID) + ".temp";
-
- if(groupID != prevGroupID){
- outFile.close();
- outFile.open(fileName.c_str(), ios::app);
- prevGroupID = groupID;
- }
-
+
//have we reached the max buffer size
- if (numOutputs[groupID] > 10) { //write out sequence
+ if (numOutputs[groupID] > 60) { //write out sequence
+ outFile.open(fileName.c_str(), ios::app);
outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
+ outFile.close();
+
outputs[groupID] = "";
numOutputs[groupID] = 0;
wroteOutPut[groupID] = true;
if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
string row, column, distance;
if(groupIDA<groupIDB){
-
+
+ //merge memory
numOutputs[groupID] += numOutputs[groupIDB];
outputs[groupID] += outputs[groupIDB];
+ outputs[groupIDB] = "";
+ numOutputs[groupIDB] = 0;
+
+ //if groupB is written to file it is above buffer size so read and write to new merged file
if (wroteOutPut[groupIDB]) {
- string fileName = distFile + "." + toString(groupIDB) + ".temp";
- ifstream fileB(fileName.c_str(), ios::ate);
+ string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
+ ifstream fileB(fileName2.c_str(), ios::ate);
+
+ outFile.open(fileName.c_str(), ios::app);
long size;
char* memblock;
delete memblock;
fileB.close();
- remove(fileName.c_str());
+ m->mothurRemove(fileName2);
+
+ //write out the merged memory
+ if (numOutputs[groupID] > 60) {
+ outFile << outputs[groupID];
+ outputs[groupID] = "";
+ numOutputs[groupID] = 0;
+ }
+
+ outFile.close();
wroteOutPut[groupID] = true;
wroteOutPut[groupIDB] = false;
- }
-
- if (numOutputs[groupID] != 0) {
- outFile << outputs[groupID];
- wroteOutPut[groupID] = true;
- outputs[groupID] = "";
- numOutputs[groupID] = 0;
-
- outputs[groupIDB] = "";
- numOutputs[groupIDB] = 0;
- }
-
+ }else{ } //just merge b's memory with a's memory
}
else{
numOutputs[groupID] += numOutputs[groupIDA];
outputs[groupID] += outputs[groupIDA];
+ outputs[groupIDA] = "";
+ numOutputs[groupIDA] = 0;
+
if (wroteOutPut[groupIDA]) {
- string fileName = distFile + "." + toString(groupIDA) + ".temp";
- ifstream fileB(fileName.c_str(), ios::ate);
+ string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
+ ifstream fileB(fileName2.c_str(), ios::ate);
+
+ outFile.open(fileName.c_str(), ios::app);
long size;
char* memblock;
delete memblock;
fileB.close();
- remove(fileName.c_str());
+ m->mothurRemove(fileName2);
+
+ //write out the merged memory
+ if (numOutputs[groupID] > 60) {
+ outFile << outputs[groupID];
+ outputs[groupID] = "";
+ numOutputs[groupID] = 0;
+ }
+
+ outFile.close();
wroteOutPut[groupID] = true;
wroteOutPut[groupIDA] = false;
- }
-
- if (numOutputs[groupID] != 0) {
- outFile << outputs[groupID];
- wroteOutPut[groupID] = true;
- outputs[groupID] = "";
- numOutputs[groupID] = 0;
-
- outputs[groupIDA] = "";
- numOutputs[groupIDA] = 0;
- }
-
+ }else { } //just merge memory
}
}
}
}
- gobble(dFile);
+ m->gobble(dFile);
}
- outFile.close();
dFile.close();
for (int i = 0; i < numGroups; i++) {
outFile.close();
}
}
+
+ splitNames(groups);
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
+ exit(1);
+ }
+}
+//********************************************************************************************************************
+int SplitMatrix::splitNames(vector<set<string> >& groups){
+ try {
+ int numGroups = groups.size();
ifstream bigNameFile(namefile.c_str());
if(!bigNameFile){
while(bigNameFile){
bigNameFile >> name >> nameList;
nameMap[name] = nameList;
- gobble(bigNameFile);
+ m->gobble(bigNameFile);
}
bigNameFile.close();
if (m->control_pressed) {
for (int i = 0; i < dists.size(); i++) {
- remove((dists[i].begin()->first).c_str());
- remove((dists[i].begin()->second).c_str());
+ m->mothurRemove((dists[i].begin()->first));
+ m->mothurRemove((dists[i].begin()->second));
}
dists.clear();
}
return 0;
-
}
catch(exception& e) {
- m->errorOut(e, "SplitMatrix", "splitDistance");
+ m->errorOut(e, "SplitMatrix", "splitNames");
exit(1);
}
}
-
-/***********************************************************************/
-int SplitMatrix::splitClassify(){
+//********************************************************************************************************************
+int SplitMatrix::splitDistanceRAM(){ // in-memory distance split: every kept distance is buffered in 'outputs' and written only after the whole file is read
try {
- cutoff = int(cutoff);
-
- map<string, int> seqGroup;
- map<string, int>::iterator it;
- map<string, int>::iterator it2;
+ vector<set<string> > groups; // groups[i] holds the sequence names of group i; emptied when the group is merged into another
+ vector<string> outputs; // outputs[i] holds the buffered "seqA<TAB>seqB<TAB>dist" lines of group i
int numGroups = 0;
-
- //build tree from users taxonomy file
- PhyloTree* phylo = new PhyloTree();
-
- ifstream in;
- openInputFile(taxFile, in);
-
- //read in users taxonomy file and add sequences to tree
- string seqname, tax;
- while(!in.eof()){
- in >> seqname >> tax; gobble(in);
-
- phylo->addSeqToTree(seqname, tax);
- }
- in.close();
-
- phylo->assignHeirarchyIDs(0);
-
- //make sure the cutoff is not greater than maxlevel
- if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
-
- //for each node in tree
- for (int i = 0; i < phylo->getNumNodes(); i++) {
-
- //is this node within the cutoff
- TaxNode taxon = phylo->get(i);
-
- if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
- if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
- for (int j = 0; j < taxon.accessions.size(); j++) {
- seqGroup[taxon.accessions[j]] = numGroups;
- }
- numGroups++;
- }
- }
- }
ifstream dFile;
- openInputFile(distFile, dFile);
- ofstream outFile;
-
- for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
- remove((distFile + "." + toString(i) + ".temp").c_str());
- }
-
-
- //for buffering the io to improve speed
- //allow for 10 dists to be stored, then output.
- vector<string> outputs; outputs.resize(numGroups, "");
- vector<int> numOutputs; numOutputs.resize(numGroups, 0);
-
- //for each distance
+ m->openInputFile(distFile, dFile);
+
while(dFile){
string seqA, seqB;
float dist;
+
+ dFile >> seqA >> seqB >> dist;
- if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
-
- dFile >> seqA >> seqB >> dist; gobble(dFile);
-
- //if both sequences are in the same group then they are within the cutoff
- it = seqGroup.find(seqA);
- it2 = seqGroup.find(seqB);
-
- if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
- if (it->second == it2->second) { //they are from the same group so add the distance
- if (numOutputs[it->second] > 10) {
- openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
- outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
- outFile.close();
- outputs[it->second] = "";
- numOutputs[it->second] = 0;
- }else{
- outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
- numOutputs[it->second]++;
+ if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
+
+ if(dist < cutoff){ // only a pair closer than the cutoff joins or creates a group
+ //cout << "in cutoff: " << dist << endl;
+ int groupIDA = -1;
+ int groupIDB = -1;
+ int groupID = -1;
+
+ for(int i=0;i<numGroups;i++){
+ set<string>::iterator aIt = groups[i].find(seqA);
+ set<string>::iterator bIt = groups[i].find(seqB);
+
+ if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
+ groups[i].insert(seqB);
+ groupIDA = i;
+ groupID = groupIDA;
+
+ //cout << "in aIt: " << groupID << endl;
+ // break;
+ }
+ else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
+ groups[i].insert(seqA);
+ groupIDB = i;
+ groupID = groupIDB;
+
+ // cout << "in bIt: " << groupID << endl;
+ // break;
+ }
+
+ if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
+ if(groupIDA < groupIDB){
+ // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
+ groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
+ groups[groupIDB].clear();
+ groupID = groupIDA;
+ }
+ else{
+ // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
+ groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
+ groups[groupIDA].clear();
+ groupID = groupIDB;
+ }
+ break;
+ }
+ }
+
+ //windows is gonna gag on the reuse of outFile, will need to make it local...
+
+ if(groupIDA == -1 && groupIDB == -1){ //we need a new group
+ set<string> newGroup;
+ newGroup.insert(seqA);
+ newGroup.insert(seqB);
+ groups.push_back(newGroup);
+
+ string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+ outputs.push_back(tempOut);
+ numGroups++;
+ }
+ else{
+
+ outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; // buffer the record with the surviving group
+
+ if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
+ string row, column, distance;
+ if(groupIDA<groupIDB){
+ //merge memory
+ outputs[groupID] += outputs[groupIDB];
+ outputs[groupIDB] = "";
+ }else{
+ outputs[groupID] += outputs[groupIDA];
+ outputs[groupIDA] = "";
+ }
}
}
}
+ m->gobble(dFile);
}
dFile.close();
-
- for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
- remove((namefile + "." + toString(i) + ".temp").c_str());
-
- //write out any remaining buffers
- if (numOutputs[it->second] > 0) {
- openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
- outFile << outputs[i];
- outFile.close();
- outputs[i] = "";
- numOutputs[i] = 0;
- }
- }
- ifstream bigNameFile;
- openInputFile(namefile, bigNameFile);
-
- singleton = namefile + ".extra.temp";
- ofstream remainingNames;
- openOutputFile(singleton, remainingNames);
-
- bool wroteExtra = false;
-
- string name, nameList;
- while(!bigNameFile.eof()){
- bigNameFile >> name >> nameList; gobble(bigNameFile);
-
- //did this sequence get assigned a group
- it = seqGroup.find(name);
-
- if (it != seqGroup.end()) {
- openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
- outFile << name << '\t' << nameList << endl;
+ for (int i = 0; i < numGroups; i++) { // write each group's buffered distances to distFile.<i>.temp
+ if (outputs[i] != "") { // a group merged into another has an empty buffer and gets no file
+ ofstream outFile;
+ string fileName = distFile + "." + toString(i) + ".temp";
+ outFile.open(fileName.c_str(), ios::ate);
+ outFile << outputs[i];
outFile.close();
- }else{
- wroteExtra = true;
- remainingNames << name << '\t' << nameList << endl;
}
}
- bigNameFile.close();
- remainingNames.close();
-
- if (!wroteExtra) {
- remove(singleton.c_str());
- singleton = "none";
- }
-
- for(int i=0;i<numGroups;i++){
- string tempNameFile = namefile + "." + toString(i) + ".temp";
- string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+ splitNames(groups); // the finished groups are passed on to splitNames()
- map<string, string> temp;
- temp[tempDistFile] = tempNameFile;
- dists.push_back(temp);
- }
-
- if (m->control_pressed) {
- for (int i = 0; i < dists.size(); i++) {
- remove((dists[i].begin()->first).c_str());
- remove((dists[i].begin()->second).c_str());
- }
- dists.clear();
- }
-
- return 0;
-
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "SplitMatrix", "splitClassify");
+ m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
exit(1);
}
}