5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
13 /***********************************************************************/
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
16 m = MothurOut::getInstance();
25 /***********************************************************************/
27 int SplitMatrix::split(){
30 if (method == "distance") {
32 }else if (method == "classify") {
35 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
36 map<string, string> temp;
37 temp[distFile] = namefile;
38 dists.push_back(temp);
44 m->errorOut(e, "SplitMatrix", "split");
48 /***********************************************************************/
49 int SplitMatrix::splitDistance(){
52 if (large) { splitDistanceLarge(); }
53 else { splitDistanceRAM(); }
57 m->errorOut(e, "SplitMatrix", "splitDistance");
62 /***********************************************************************/
63 int SplitMatrix::splitClassify(){
67 map<string, int> seqGroup;
68 map<string, int>::iterator it;
69 map<string, int>::iterator it2;
73 //build tree from users taxonomy file
74 PhyloTree* phylo = new PhyloTree();
77 openInputFile(taxFile, in);
79 //read in users taxonomy file and add sequences to tree
82 in >> seqname >> tax; gobble(in);
84 phylo->addSeqToTree(seqname, tax);
88 phylo->assignHeirarchyIDs(0);
90 //make sure the cutoff is not greater than maxlevel
91 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
93 //for each node in tree
94 for (int i = 0; i < phylo->getNumNodes(); i++) {
96 //is this node within the cutoff
97 TaxNode taxon = phylo->get(i);
99 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
100 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
101 for (int j = 0; j < taxon.accessions.size(); j++) {
102 seqGroup[taxon.accessions[j]] = numGroups;
110 openInputFile(distFile, dFile);
113 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
114 remove((distFile + "." + toString(i) + ".temp").c_str());
118 //for buffering the io to improve speed
119 //allow for 10 dists to be stored, then output.
120 vector<string> outputs; outputs.resize(numGroups, "");
121 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
128 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
130 dFile >> seqA >> seqB >> dist; gobble(dFile);
132 //if both sequences are in the same group then they are within the cutoff
133 it = seqGroup.find(seqA);
134 it2 = seqGroup.find(seqB);
136 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
137 if (it->second == it2->second) { //they are from the same group so add the distance
138 if (numOutputs[it->second] > 10) {
139 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
140 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
142 outputs[it->second] = "";
143 numOutputs[it->second] = 0;
145 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
146 numOutputs[it->second]++;
153 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
154 remove((namefile + "." + toString(i) + ".temp").c_str());
156 //write out any remaining buffers
157 if (numOutputs[it->second] > 0) {
158 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
159 outFile << outputs[i];
166 ifstream bigNameFile;
167 openInputFile(namefile, bigNameFile);
169 singleton = namefile + ".extra.temp";
170 ofstream remainingNames;
171 openOutputFile(singleton, remainingNames);
173 bool wroteExtra = false;
175 string name, nameList;
176 while(!bigNameFile.eof()){
177 bigNameFile >> name >> nameList; gobble(bigNameFile);
179 //did this sequence get assigned a group
180 it = seqGroup.find(name);
182 if (it != seqGroup.end()) {
183 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
184 outFile << name << '\t' << nameList << endl;
188 remainingNames << name << '\t' << nameList << endl;
192 remainingNames.close();
195 remove(singleton.c_str());
199 for(int i=0;i<numGroups;i++){
200 string tempNameFile = namefile + "." + toString(i) + ".temp";
201 string tempDistFile = distFile + "." + toString(i) + ".temp";
203 map<string, string> temp;
204 temp[tempDistFile] = tempNameFile;
205 dists.push_back(temp);
208 if (m->control_pressed) {
209 for (int i = 0; i < dists.size(); i++) {
210 remove((dists[i].begin()->first).c_str());
211 remove((dists[i].begin()->second).c_str());
219 catch(exception& e) {
220 m->errorOut(e, "SplitMatrix", "splitClassify");
224 /***********************************************************************/
225 int SplitMatrix::splitDistanceLarge(){
227 vector<set<string> > groups;
229 //for buffering the io to improve speed
230 //allow for 30 dists to be stored, then output.
231 vector<string> outputs;
232 vector<int> numOutputs;
233 vector<bool> wroteOutPut;
239 openInputFile(distFile, dFile);
245 dFile >> seqA >> seqB >> dist;
247 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
250 //cout << "in cutoff: " << dist << endl;
255 for(int i=0;i<numGroups;i++){
256 set<string>::iterator aIt = groups[i].find(seqA);
257 set<string>::iterator bIt = groups[i].find(seqB);
259 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
260 groups[i].insert(seqB);
264 //cout << "in aIt: " << groupID << endl;
267 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
268 groups[i].insert(seqA);
272 // cout << "in bIt: " << groupID << endl;
276 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
277 if(groupIDA < groupIDB){
278 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
279 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
280 groups[groupIDB].clear();
284 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
285 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
286 groups[groupIDA].clear();
293 //windows is gonna gag on the reuse of outFile, will need to make it local...
295 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
296 set<string> newGroup;
297 newGroup.insert(seqA);
298 newGroup.insert(seqB);
299 groups.push_back(newGroup);
301 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
302 outputs.push_back(tempOut);
303 numOutputs.push_back(1);
304 wroteOutPut.push_back(false);
309 string fileName = distFile + "." + toString(groupID) + ".temp";
311 //have we reached the max buffer size
312 if (numOutputs[groupID] > 60) { //write out sequence
313 outFile.open(fileName.c_str(), ios::app);
314 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
317 outputs[groupID] = "";
318 numOutputs[groupID] = 0;
319 wroteOutPut[groupID] = true;
321 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
322 numOutputs[groupID]++;
325 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
326 string row, column, distance;
327 if(groupIDA<groupIDB){
330 numOutputs[groupID] += numOutputs[groupIDB];
331 outputs[groupID] += outputs[groupIDB];
333 outputs[groupIDB] = "";
334 numOutputs[groupIDB] = 0;
336 //if groupB is written to file it is above buffer size so read and write to new merged file
337 if (wroteOutPut[groupIDB]) {
338 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
339 ifstream fileB(fileName2.c_str(), ios::ate);
341 outFile.open(fileName.c_str(), ios::app);
346 size = fileB.tellg();
348 fileB.seekg (0, ios::beg);
350 int numRead = size / 1024;
351 int lastRead = size % 1024;
353 for (int i = 0; i < numRead; i++) {
355 memblock = new char [1024];
357 fileB.read (memblock, 1024);
359 string temp = memblock;
360 outFile << temp.substr(0, 1024);
365 memblock = new char [lastRead];
367 fileB.read (memblock, lastRead);
369 //not sure why but it will read more than lastRead char...??
370 string temp = memblock;
371 outFile << temp.substr(0, lastRead);
375 remove(fileName2.c_str());
377 //write out the merged memory
378 if (numOutputs[groupID] > 60) {
379 outFile << outputs[groupID];
380 outputs[groupID] = "";
381 numOutputs[groupID] = 0;
386 wroteOutPut[groupID] = true;
387 wroteOutPut[groupIDB] = false;
388 }else{ } //just merge b's memory with a's memory
391 numOutputs[groupID] += numOutputs[groupIDA];
392 outputs[groupID] += outputs[groupIDA];
394 outputs[groupIDA] = "";
395 numOutputs[groupIDA] = 0;
397 if (wroteOutPut[groupIDA]) {
398 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
399 ifstream fileB(fileName2.c_str(), ios::ate);
401 outFile.open(fileName.c_str(), ios::app);
406 size = fileB.tellg();
408 fileB.seekg (0, ios::beg);
410 int numRead = size / 1024;
411 int lastRead = size % 1024;
413 for (int i = 0; i < numRead; i++) {
415 memblock = new char [1024];
417 fileB.read (memblock, 1024);
418 string temp = memblock;
419 outFile << temp.substr(0, 1024);
424 memblock = new char [lastRead];
426 fileB.read (memblock, lastRead);
428 //not sure why but it will read more than lastRead char...??
429 string temp = memblock;
430 outFile << temp.substr(0, lastRead);
435 remove(fileName2.c_str());
437 //write out the merged memory
438 if (numOutputs[groupID] > 60) {
439 outFile << outputs[groupID];
440 outputs[groupID] = "";
441 numOutputs[groupID] = 0;
446 wroteOutPut[groupID] = true;
447 wroteOutPut[groupIDA] = false;
448 }else { } //just merge memory
457 for (int i = 0; i < numGroups; i++) {
458 if (numOutputs[i] > 0) {
459 string fileName = distFile + "." + toString(i) + ".temp";
460 outFile.open(fileName.c_str(), ios::app);
461 outFile << outputs[i];
470 catch(exception& e) {
471 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
475 //********************************************************************************************************************
476 int SplitMatrix::splitNames(vector<set<string> >& groups){
478 int numGroups = groups.size();
480 ifstream bigNameFile(namefile.c_str());
482 cerr << "Error: We can't open the name file\n";
486 map<string, string> nameMap;
487 string name, nameList;
489 bigNameFile >> name >> nameList;
490 nameMap[name] = nameList;
495 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
496 int numSeqsInGroup = groups[i].size();
498 if(numSeqsInGroup > 0){
499 string fileName = namefile + "." + toString(i) + ".temp";
500 ofstream smallNameFile(fileName.c_str(), ios::ate);
502 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
503 map<string,string>::iterator nIt = nameMap.find(*gIt);
504 if (nIt != nameMap.end()) {
505 smallNameFile << nIt->first << '\t' << nIt->second << endl;
508 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
511 smallNameFile.close();
515 //names of singletons
516 if (nameMap.size() != 0) {
517 singleton = namefile + ".extra.temp";
518 ofstream remainingNames(singleton.c_str(), ios::ate);
519 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
520 remainingNames << nIt->first << '\t' << nIt->second << endl;
522 remainingNames.close();
523 }else { singleton = "none"; }
525 for(int i=0;i<numGroups;i++){
526 if(groups[i].size() > 0){
527 string tempNameFile = namefile + "." + toString(i) + ".temp";
528 string tempDistFile = distFile + "." + toString(i) + ".temp";
530 map<string, string> temp;
531 temp[tempDistFile] = tempNameFile;
532 dists.push_back(temp);
536 if (m->control_pressed) {
537 for (int i = 0; i < dists.size(); i++) {
538 remove((dists[i].begin()->first).c_str());
539 remove((dists[i].begin()->second).c_str());
546 catch(exception& e) {
547 m->errorOut(e, "SplitMatrix", "splitNames");
551 //********************************************************************************************************************
552 int SplitMatrix::splitDistanceRAM(){
554 vector<set<string> > groups;
555 vector<string> outputs;
560 openInputFile(distFile, dFile);
566 dFile >> seqA >> seqB >> dist;
568 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
571 //cout << "in cutoff: " << dist << endl;
576 for(int i=0;i<numGroups;i++){
577 set<string>::iterator aIt = groups[i].find(seqA);
578 set<string>::iterator bIt = groups[i].find(seqB);
580 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
581 groups[i].insert(seqB);
585 //cout << "in aIt: " << groupID << endl;
588 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
589 groups[i].insert(seqA);
593 // cout << "in bIt: " << groupID << endl;
597 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
598 if(groupIDA < groupIDB){
599 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
600 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
601 groups[groupIDB].clear();
605 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
606 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
607 groups[groupIDA].clear();
614 //windows is gonna gag on the reuse of outFile, will need to make it local...
616 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
617 set<string> newGroup;
618 newGroup.insert(seqA);
619 newGroup.insert(seqB);
620 groups.push_back(newGroup);
622 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
623 outputs.push_back(tempOut);
628 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
630 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
631 string row, column, distance;
632 if(groupIDA<groupIDB){
634 outputs[groupID] += outputs[groupIDB];
635 outputs[groupIDB] = "";
637 outputs[groupID] += outputs[groupIDA];
638 outputs[groupIDA] = "";
647 for (int i = 0; i < numGroups; i++) {
648 if (outputs[i] != "") {
650 string fileName = distFile + "." + toString(i) + ".temp";
651 outFile.open(fileName.c_str(), ios::ate);
652 outFile << outputs[i];
661 catch(exception& e) {
662 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
666 //********************************************************************************************************************
667 //sorts biggest to smallest
668 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
673 //get num bytes in file
674 string filename = left.begin()->first;
675 pFile = fopen (filename.c_str(),"rb");
676 string error = "Error opening " + filename;
677 if (pFile==NULL) perror (error.c_str());
679 fseek (pFile, 0, SEEK_END);
680 leftsize=ftell (pFile);
687 //get num bytes in file
688 filename = right.begin()->first;
689 pFile2 = fopen (filename.c_str(),"rb");
690 error = "Error opening " + filename;
691 if (pFile2==NULL) perror (error.c_str());
693 fseek (pFile2, 0, SEEK_END);
694 rightsize=ftell (pFile2);
698 return (leftsize > rightsize);
700 /***********************************************************************/
701 //returns map of distance files -> namefile sorted by distance file size
702 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
705 sort(dists.begin(), dists.end(), compareFileSizes);
709 catch(exception& e) {
710 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
714 /***********************************************************************/
715 SplitMatrix::~SplitMatrix(){}
716 /***********************************************************************/