5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "sequencedb.h"
13 #include "onegapdist.h"
16 /***********************************************************************/
// Constructor taking (judging by the parameter names) a distance file, a name file and a
// taxonomy file, plus a float c, a string t and a bool l.
// NOTE(review): only the MothurOut lookup is visible in this excerpt; how the arguments
// are stored is not shown here - confirm against the full file.
18 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
// fetch the shared MothurOut instance; other methods of this class call m->mothurOut / m->errorOut
19 m = MothurOut::getInstance();
27 /***********************************************************************/
// Constructor taking (judging by the parameter names) a fasta file and a taxonomy file,
// plus a float c and a string t.
// NOTE(review): only the MothurOut lookup is visible in this excerpt; how the arguments
// are stored is not shown here - confirm against the full file.
29 SplitMatrix::SplitMatrix(string ffile, string tax, float c, string t){
// fetch the shared MothurOut instance; other methods of this class call m->mothurOut / m->errorOut
30 m = MothurOut::getInstance();
37 /***********************************************************************/
// Dispatches the split according to the `method` member: "distance" selects the first branch,
// "classify" and "fasta" share the second one.
// NOTE(review): the calls made inside those two branches are not visible in this excerpt.
39 int SplitMatrix::split(){
42 if (method == "distance") {
44 }else if ((method == "classify") || (method == "fasta")) {
// unrecognised method: tell the user, then register the original distFile -> namefile pair in dists
// (presumably so callers still receive the unsplit input - confirm)
47 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
48 map<string, string> temp;
49 temp[distFile] = namefile;
50 dists.push_back(temp);
// report exception e under class "SplitMatrix", function "split"
// (the catch clause that binds e is not shown in this excerpt)
56 m->errorOut(e, "SplitMatrix", "split");
60 /***********************************************************************/
// Picks the distance-splitting strategy from the `large` member:
// splitDistanceLarge() when it is set, splitDistanceRAM() otherwise.
61 int SplitMatrix::splitDistance(){
64 if (large) { splitDistanceLarge(); }
65 else { splitDistanceRAM(); }
// report exception e under class "SplitMatrix", function "splitDistance"
// (the catch clause that binds e is not shown in this excerpt)
69 m->errorOut(e, "SplitMatrix", "splitDistance");
74 /***********************************************************************/
// Groups sequences by taxonomy: builds a PhyloTree from taxFile, assigns every sequence that
// sits under a multi-sequence node at level `cutoff` to a group number, then hands the
// sequence -> group map to splitDistanceFileByTax (method "classify") or to
// createDistanceFilesFromTax.
75 int SplitMatrix::splitClassify(){
// seqGroup maps a sequence name to the number of the group it was assigned to
79 map<string, int> seqGroup;
80 map<string, int>::iterator it;
81 map<string, int>::iterator it2;
85 //build tree from users taxonomy file
// NOTE(review): allocated with new; no matching delete is visible in this excerpt - confirm it is released
86 PhyloTree* phylo = new PhyloTree();
89 openInputFile(taxFile, in);
91 //read in users taxonomy file and add sequences to tree
// each record is read as two whitespace-separated tokens: sequence name, then taxonomy string
94 in >> seqname >> tax; gobble(in);
95 phylo->addSeqToTree(seqname, tax);
99 phylo->assignHeirarchyIDs(0);
101 //make sure the cutoff is not greater than maxlevel
// when it is, the user is told and cutoff is lowered to the tree's max level
102 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
104 //for each node in tree
105 for (int i = 0; i < phylo->getNumNodes(); i++) {
107 //is this node within the cutoff
108 TaxNode taxon = phylo->get(i);
// note: the test is equality, so only nodes exactly at level `cutoff` produce groups
110 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
111 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
112 for (int j = 0; j < taxon.accessions.size(); j++) {
// every accession under this node gets the current group number
// (where numGroups is advanced is not visible in this excerpt)
113 seqGroup[taxon.accessions[j]] = numGroups;
// "classify": split the existing distance file using the groups
122 if (method == "classify") {
123 splitDistanceFileByTax(seqGroup, numGroups);
// other case (the else line is not shown in this excerpt): build the distance files from the fasta file
125 createDistanceFilesFromTax(seqGroup, numGroups);
132 catch(exception& e) {
133 m->errorOut(e, "SplitMatrix", "splitClassify");
137 /***********************************************************************/
// Takes the sequence -> group map and the number of groups. The visible part loads every
// sequence from fastafile into alignDB, remembers each sequence's position by name, and
// creates a oneGapDist distance calculator.
// NOTE(review): the distance calculation and file output themselves are not visible in this excerpt.
138 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
140 map<string, int>::iterator it;
141 map<string, int>::iterator it2;
// sequence name -> index of that sequence as it was read from the fasta file
142 map<string, int> seqIndexInFasta;
148 openInputFile(fastafile, filehandle);
150 while (!filehandle.eof()) {
151 //input sequence info into sequencedb
152 Sequence newSequence(filehandle);
// sequences that come back with an empty name are skipped
154 if (newSequence.getName() != "") {
155 alignDB.push_back(newSequence);
// numSeqs is used as the index of the sequence just stored
// (its increment is not visible in this excerpt)
156 seqIndexInFasta[newSequence.getName()] = numSeqs;
160 //takes care of white space
// NOTE(review): allocated with new; no matching delete is visible in this excerpt - confirm it is released
165 Dist* distCalculator = new oneGapDist();
173 catch(exception& e) {
174 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
178 /***********************************************************************/
// Splits distFile and namefile into per-group temp files using the sequence -> group map.
// A distance is kept only when both of its sequences belong to the same group; it goes to
// distFile.<group>.temp. Names with a group go to namefile.<group>.temp, all other names to
// namefile.extra.temp. Each group that received distances is registered in dists as
// { temp distance file -> temp name file }.
179 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
181 map<string, int>::iterator it;
182 map<string, int>::iterator it2;
185 openInputFile(distFile, dFile);
188 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
189 remove((distFile + "." + toString(i) + ".temp").c_str());
192 //for buffering the io to improve speed
193 //a group's pending distances are written out once more than 30 are buffered (see the numOutputs check below)
// outputs[g] is the pending text for group g, numOutputs[g] the number of distances it holds
194 vector<string> outputs; outputs.resize(numGroups, "");
195 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
197 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
198 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
199 vector<bool> validDistances; validDistances.resize(numGroups, false);
// NOTE(review): on interrupt the input is closed and the temp files removed, but this statement
// contains no return, so the read below would run on a closed stream - confirm whether a return is intended
206 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
// one record: two sequence names followed by their distance
208 dFile >> seqA >> seqB >> dist; gobble(dFile);
210 //if both sequences are in the same group then they are within the cutoff
211 it = seqGroup.find(seqA);
212 it2 = seqGroup.find(seqB);
214 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
215 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer is full: append the buffered text plus the current record to the group's temp file
216 if (numOutputs[it->second] > 30) {
217 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
218 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
// reset the buffer and remember that this group has distances on disk
220 outputs[it->second] = "";
221 numOutputs[it->second] = 0;
222 validDistances[it->second] = true;
// buffer not full yet (the else line is not shown in this excerpt): keep the record in memory
224 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
225 numOutputs[it->second]++;
232 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
233 remove((namefile + "." + toString(i) + ".temp").c_str());
235 //write out any remaining buffers
236 if (numOutputs[i] > 0) {
237 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
238 outFile << outputs[i];
// anything flushed here also makes the group valid
242 validDistances[i] = true;
// second pass: distribute the name file
246 ifstream bigNameFile;
247 openInputFile(namefile, bigNameFile);
// names without a group are collected in <namefile>.extra.temp
249 singleton = namefile + ".extra.temp";
250 ofstream remainingNames;
251 openOutputFile(singleton, remainingNames);
// NOTE(review): where wroteExtra is set is not visible in this excerpt
253 bool wroteExtra = false;
255 string name, nameList;
256 while(!bigNameFile.eof()){
// one record: representative name followed by its name list
257 bigNameFile >> name >> nameList; gobble(bigNameFile);
259 //did this sequence get assigned a group
260 it = seqGroup.find(name);
// grouped name: append it to that group's temp name file
262 if (it != seqGroup.end()) {
263 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
264 outFile << name << '\t' << nameList << endl;
// ungrouped name (the else line is not shown in this excerpt): goes to the extra file
268 remainingNames << name << '\t' << nameList << endl;
272 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible here - presumably the
// extra file is deleted when nothing was written to it; confirm
275 remove(singleton.c_str());
279 for(int i=0;i<numGroups;i++){
280 //if there are valid distances
281 if (validDistances[i]) {
282 string tempNameFile = namefile + "." + toString(i) + ".temp";
283 string tempDistFile = distFile + "." + toString(i) + ".temp";
// register the pair as a one-entry map: temp distance file -> temp name file
285 map<string, string> temp;
286 temp[tempDistFile] = tempNameFile;
287 dists.push_back(temp);
// interrupted: delete every temp distance/name file registered in dists
291 if (m->control_pressed) {
292 for (int i = 0; i < dists.size(); i++) {
293 remove((dists[i].begin()->first).c_str());
294 remove((dists[i].begin()->second).c_str());
301 catch(exception& e) {
302 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
306 /***********************************************************************/
// Splits distFile into groups of sequences linked by the distances read from it, keeping only a
// bounded amount of text per group in memory. Each group's distances end up in
// distFile.<group>.temp; when a distance links two existing groups, the groups, their
// in-memory buffers and their temp files are merged into the lower-numbered group.
// NOTE(review): the cutoff test and the assignments to groupID/groupIDA/groupIDB are not
// visible in this excerpt.
307 int SplitMatrix::splitDistanceLarge(){
// groups[i] is the set of sequence names currently assigned to group i
309 vector<set<string> > groups;
311 //for buffering the io to improve speed
312 //a group's pending dists are written out once more than 60 are buffered (see the numOutputs checks below)
// outputs[i]: pending text of group i; numOutputs[i]: number of dists in it;
// wroteOutPut[i]: whether group i already has data in its temp file
313 vector<string> outputs;
314 vector<int> numOutputs;
315 vector<bool> wroteOutPut;
321 openInputFile(distFile, dFile);
// one record: two sequence names followed by their distance
327 dFile >> seqA >> seqB >> dist;
// interrupted: close the input, delete the temp file of every non-empty group and stop
329 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
332 //cout << "in cutoff: " << dist << endl;
// look for the groups that already contain seqA and/or seqB
337 for(int i=0;i<numGroups;i++){
338 set<string>::iterator aIt = groups[i].find(seqA);
339 set<string>::iterator bIt = groups[i].find(seqB);
341 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
342 groups[i].insert(seqB);
346 //cout << "in aIt: " << groupID << endl;
349 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
350 groups[i].insert(seqA);
354 // cout << "in bIt: " << groupID << endl;
358 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// the lower-numbered group survives; the other one is emptied
359 if(groupIDA < groupIDB){
360 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
361 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
362 groups[groupIDB].clear();
366 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
367 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
368 groups[groupIDA].clear();
375 //windows is gonna gag on the reuse of outFile, will need to make it local...
377 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
378 set<string> newGroup;
379 newGroup.insert(seqA);
380 newGroup.insert(seqB);
381 groups.push_back(newGroup);
// the new group starts with this one distance buffered and nothing on disk
383 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
384 outputs.push_back(tempOut);
385 numOutputs.push_back(1);
386 wroteOutPut.push_back(false);
// existing group: its temp file is distFile.<groupID>.temp
391 string fileName = distFile + "." + toString(groupID) + ".temp";
393 //have we reached the max buffer size
394 if (numOutputs[groupID] > 60) { //write out sequence
// append the buffered text plus the current record, then reset the buffer
395 outFile.open(fileName.c_str(), ios::app);
396 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
399 outputs[groupID] = "";
400 numOutputs[groupID] = 0;
401 wroteOutPut[groupID] = true;
// buffer not full (the else line is not shown in this excerpt): keep the record in memory
403 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
404 numOutputs[groupID]++;
407 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
408 string row, column, distance;
// case 1: group B was folded into group A
409 if(groupIDA<groupIDB){
// move B's in-memory buffer into the surviving group's buffer
412 numOutputs[groupID] += numOutputs[groupIDB];
413 outputs[groupID] += outputs[groupIDB];
415 outputs[groupIDB] = "";
416 numOutputs[groupIDB] = 0;
418 //if groupB is written to file it is above buffer size so read and write to new merged file
419 if (wroteOutPut[groupIDB]) {
420 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened positioned at the end so tellg() below yields the file size
421 ifstream fileB(fileName2.c_str(), ios::ate);
423 outFile.open(fileName.c_str(), ios::app);
428 size = fileB.tellg();
430 fileB.seekg (0, ios::beg);
// copy B's file in 1024-byte chunks plus one final partial chunk
432 int numRead = size / 1024;
433 int lastRead = size % 1024;
435 for (int i = 0; i < numRead; i++) {
// NOTE(review): no delete[] for memblock is visible in this excerpt - confirm it is freed
437 memblock = new char [1024];
439 fileB.read (memblock, 1024);
// NOTE(review): read() does not null-terminate and no terminator is written in the visible
// lines, yet the buffer is converted as a C string here; the substr() trims the result, but the
// conversion may scan past the allocation and stops early at any '\0' - outFile.write(memblock, 1024)
// would avoid both; confirm
441 string temp = memblock;
442 outFile << temp.substr(0, 1024);
447 memblock = new char [lastRead];
449 fileB.read (memblock, lastRead);
451 //not sure why but it will read more than lastRead char...??
// NOTE(review): likely explanation for the comment above - memblock has no terminator, so the
// string conversion runs beyond lastRead bytes; confirm
452 string temp = memblock;
453 outFile << temp.substr(0, lastRead);
// B's temp file has been copied, so delete it
457 remove(fileName2.c_str());
459 //write out the merged memory
460 if (numOutputs[groupID] > 60) {
461 outFile << outputs[groupID];
462 outputs[groupID] = "";
463 numOutputs[groupID] = 0;
// the surviving group now has data on disk, B no longer does
468 wroteOutPut[groupID] = true;
469 wroteOutPut[groupIDB] = false;
470 }else{ } //just merge b's memory with a's memory
// case 2: group A was folded into group B - same steps with the roles swapped
473 numOutputs[groupID] += numOutputs[groupIDA];
474 outputs[groupID] += outputs[groupIDA];
476 outputs[groupIDA] = "";
477 numOutputs[groupIDA] = 0;
479 if (wroteOutPut[groupIDA]) {
480 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened positioned at the end so tellg() below yields the file size
481 ifstream fileB(fileName2.c_str(), ios::ate);
483 outFile.open(fileName.c_str(), ios::app);
488 size = fileB.tellg();
490 fileB.seekg (0, ios::beg);
492 int numRead = size / 1024;
493 int lastRead = size % 1024;
495 for (int i = 0; i < numRead; i++) {
// NOTE(review): same buffer concerns as in case 1 (no visible delete[], unterminated C-string conversion)
497 memblock = new char [1024];
499 fileB.read (memblock, 1024);
500 string temp = memblock;
501 outFile << temp.substr(0, 1024);
506 memblock = new char [lastRead];
508 fileB.read (memblock, lastRead);
510 //not sure why but it will read more than lastRead char...??
511 string temp = memblock;
512 outFile << temp.substr(0, lastRead);
// A's temp file has been copied, so delete it
517 remove(fileName2.c_str());
519 //write out the merged memory
520 if (numOutputs[groupID] > 60) {
521 outFile << outputs[groupID];
522 outputs[groupID] = "";
523 numOutputs[groupID] = 0;
528 wroteOutPut[groupID] = true;
529 wroteOutPut[groupIDA] = false;
530 }else { } //just merge memory
// end of input: append whatever is still buffered for each group to its temp file
539 for (int i = 0; i < numGroups; i++) {
540 if (numOutputs[i] > 0) {
541 string fileName = distFile + "." + toString(i) + ".temp";
542 outFile.open(fileName.c_str(), ios::app);
543 outFile << outputs[i];
552 catch(exception& e) {
553 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
557 //********************************************************************************************************************
// Writes one temp name file per non-empty group (namefile.<i>.temp) containing the name-file
// records of that group's sequences, writes leftover names to namefile.extra.temp, and
// registers { distFile.<i>.temp -> namefile.<i>.temp } in dists for every non-empty group.
558 int SplitMatrix::splitNames(vector<set<string> >& groups){
560 int numGroups = groups.size();
562 ifstream bigNameFile(namefile.c_str());
// printed when the name file cannot be opened (the test itself is not shown in this excerpt)
564 cerr << "Error: We can't open the name file\n";
// whole name file in memory: first column -> second column
568 map<string, string> nameMap;
569 string name, nameList;
571 bigNameFile >> name >> nameList;
572 nameMap[name] = nameList;
577 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
578 int numSeqsInGroup = groups[i].size();
// empty groups (e.g. ones cleared by a merge) get no name file
580 if(numSeqsInGroup > 0){
581 string fileName = namefile + "." + toString(i) + ".temp";
// NOTE(review): opened with ios::ate rather than ios::app; for an ofstream this does not
// preserve existing content - confirm that is intended
582 ofstream smallNameFile(fileName.c_str(), ios::ate);
584 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
585 map<string,string>::iterator nIt = nameMap.find(*gIt);
586 if (nIt != nameMap.end()) {
587 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// a sequence present in the distance file but absent from the name file is fatal
// NOTE(review): exit(1) terminates the whole process from inside this class - confirm this is acceptable
590 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
593 smallNameFile.close();
597 //names of singletons
// NOTE(review): this relies on grouped names having been removed from nameMap; that removal
// is not visible in this excerpt - confirm
598 if (nameMap.size() != 0) {
599 singleton = namefile + ".extra.temp";
600 ofstream remainingNames(singleton.c_str(), ios::ate);
601 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
602 remainingNames << nIt->first << '\t' << nIt->second << endl;
604 remainingNames.close();
// no leftovers: the singleton file name is set to the literal "none"
605 }else { singleton = "none"; }
607 for(int i=0;i<numGroups;i++){
608 if(groups[i].size() > 0){
609 string tempNameFile = namefile + "." + toString(i) + ".temp";
610 string tempDistFile = distFile + "." + toString(i) + ".temp";
// register the pair as a one-entry map: temp distance file -> temp name file
612 map<string, string> temp;
613 temp[tempDistFile] = tempNameFile;
614 dists.push_back(temp);
// interrupted: delete every temp distance/name file registered in dists
618 if (m->control_pressed) {
619 for (int i = 0; i < dists.size(); i++) {
620 remove((dists[i].begin()->first).c_str());
621 remove((dists[i].begin()->second).c_str());
628 catch(exception& e) {
629 m->errorOut(e, "SplitMatrix", "splitNames");
633 //********************************************************************************************************************
// In-memory counterpart of splitDistanceLarge: sequences linked by the distances read from
// distFile are collected into groups, each group's distance text is accumulated in outputs[i],
// and everything is written to distFile.<i>.temp only once at the end.
// NOTE(review): the cutoff test and the assignments to groupID/groupIDA/groupIDB are not
// visible in this excerpt.
634 int SplitMatrix::splitDistanceRAM(){
// groups[i]: sequence names of group i; outputs[i]: all distance lines collected for group i
636 vector<set<string> > groups;
637 vector<string> outputs;
642 openInputFile(distFile, dFile);
// one record: two sequence names followed by their distance
648 dFile >> seqA >> seqB >> dist;
// interrupted: close the input, delete the temp file of every non-empty group and stop
650 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
653 //cout << "in cutoff: " << dist << endl;
// look for the groups that already contain seqA and/or seqB
658 for(int i=0;i<numGroups;i++){
659 set<string>::iterator aIt = groups[i].find(seqA);
660 set<string>::iterator bIt = groups[i].find(seqB);
662 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
663 groups[i].insert(seqB);
667 //cout << "in aIt: " << groupID << endl;
670 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
671 groups[i].insert(seqA);
675 // cout << "in bIt: " << groupID << endl;
679 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// the lower-numbered group survives; the other one is emptied
680 if(groupIDA < groupIDB){
681 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
682 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
683 groups[groupIDB].clear();
687 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
688 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
689 groups[groupIDA].clear();
696 //windows is gonna gag on the reuse of outFile, will need to make it local...
698 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
699 set<string> newGroup;
700 newGroup.insert(seqA);
701 newGroup.insert(seqB);
702 groups.push_back(newGroup);
// the new group's output starts with this one distance line
704 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
705 outputs.push_back(tempOut);
// existing group: append the distance line to its accumulated output
710 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
712 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
713 string row, column, distance;
// move the absorbed group's text into the surviving group and empty the absorbed one
714 if(groupIDA<groupIDB){
716 outputs[groupID] += outputs[groupIDB];
717 outputs[groupIDB] = "";
719 outputs[groupID] += outputs[groupIDA];
720 outputs[groupIDA] = "";
// end of input: write each non-empty group's text to its temp file
729 for (int i = 0; i < numGroups; i++) {
730 if (outputs[i] != "") {
732 string fileName = distFile + "." + toString(i) + ".temp";
// NOTE(review): opened with ios::ate here, whereas splitDistanceLarge opens its temp files
// with ios::app - confirm the difference is intended
733 outFile.open(fileName.c_str(), ios::ate);
734 outFile << outputs[i];
743 catch(exception& e) {
744 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
748 //********************************************************************************************************************
749 //sorts biggest to smallest
// Ordering predicate over the one-entry { distance file -> name file } maps stored in dists:
// returns true when the file named by left's first key is larger (in bytes) than the file
// named by right's first key.
// NOTE(review): both maps are taken by value, so each comparison copies them - const
// references would avoid the copies; confirm nothing depends on the by-value signature.
750 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
755 //get num bytes in file
756 string filename = left.begin()->first;
757 pFile = fopen (filename.c_str(),"rb");
758 string error = "Error opening " + filename;
// NOTE(review): a failed open is only reported; whether the fseek/ftell below are skipped in
// that case is not visible in this excerpt, and no fclose is visible either - confirm
759 if (pFile==NULL) perror (error.c_str());
// seek to the end so ftell reports the file size
761 fseek (pFile, 0, SEEK_END);
762 leftsize=ftell (pFile);
769 //get num bytes in file
770 filename = right.begin()->first;
771 pFile2 = fopen (filename.c_str(),"rb");
772 error = "Error opening " + filename;
773 if (pFile2==NULL) perror (error.c_str());
775 fseek (pFile2, 0, SEEK_END);
776 rightsize=ftell (pFile2);
// larger file sorts first
780 return (leftsize > rightsize);
782 /***********************************************************************/
783 //returns map of distance files -> namefile sorted by distance file size
// Each element of the returned vector is a map built by the split routines with a single
// entry { temp distance file -> temp name file }.
784 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
// order dists with compareFileSizes, i.e. largest distance file first
787 sort(dists.begin(), dists.end(), compareFileSizes);
791 catch(exception& e) {
792 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
796 /***********************************************************************/
// Destructor with an empty body: no cleanup is performed here.
797 SplitMatrix::~SplitMatrix(){}
798 /***********************************************************************/