5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
14 /***********************************************************************/
// Constructor for splitting an existing distance file.
// Parameters: distfile (distance file), name (name file), tax (taxonomy file),
// c (cutoff), t (a string) and l (a bool).
// Only the MothurOut singleton lookup is visible in this extract. The statements that
// store the arguments are not shown -- presumably they set distFile, namefile, taxFile,
// cutoff, method and large; TODO confirm against the full source.
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17 m = MothurOut::getInstance();
25 /***********************************************************************/
// Constructor for splitting starting from a fasta file.
// Parameters: ffile (fasta file), name (name file), tax (taxonomy file), c (cutoff),
// t (a string), p (an int) and output (a string).
// Only the MothurOut singleton lookup is visible in this extract. The statements that
// store the arguments are not shown -- presumably they set fastafile, namefile, taxFile,
// cutoff, method, processors and outputDir; TODO confirm against the full source.
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p, string output){
28 m = MothurOut::getInstance();
38 /***********************************************************************/
// Entry point: chooses the splitting strategy from `method`.
//   "distance"            -> first branch (body not visible in this extract)
//   "classify" or "fasta" -> second branch (body not visible in this extract)
// For an unrecognised method (presumably the final else branch -- TODO confirm) an
// "Unknown splitting method" message is printed and the unsplit distFile/namefile
// pair is pushed onto dists as a single entry.
// Exceptions are reported through m->errorOut.
40 int SplitMatrix::split(){
43 if (method == "distance") {
45 }else if ((method == "classify") || (method == "fasta")) {
48 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
// fall back to a single "group" made of the original files
49 map<string, string> temp;
50 temp[distFile] = namefile;
51 dists.push_back(temp);
57 m->errorOut(e, "SplitMatrix", "split");
61 /***********************************************************************/
// Splits by distance: calls splitDistanceLarge() when the `large` flag is set and
// splitDistanceRAM() otherwise. Exceptions are reported through m->errorOut.
62 int SplitMatrix::splitDistance(){
65 if (large) { splitDistanceLarge(); }
66 else { splitDistanceRAM(); }
70 m->errorOut(e, "SplitMatrix", "splitDistance");
75 /***********************************************************************/
// Groups sequences by taxonomy and hands the grouping to the file-splitting step.
// Steps visible in this extract:
//   1. build a PhyloTree from the (seqname, taxonomy) pairs read from taxFile;
//   2. clamp cutoff to the tree's maximum level;
//   3. for every node whose level equals cutoff and that holds more than one
//      accession, record each accession in seqGroup under the current numGroups id;
//   4. call splitDistanceFileByTax when method == "classify"; createDistanceFilesFromTax
//      is presumably the else branch (the line between them is not visible).
// NOTE(review): the declaration/increment of numGroups and the release of phylo are
// not visible in this extract -- confirm against the full source.
// Exceptions are reported through m->errorOut.
76 int SplitMatrix::splitClassify(){
// seqGroup maps a sequence name to the id of the group it was assigned to
80 map<string, int> seqGroup;
81 map<string, int>::iterator it;
82 map<string, int>::iterator it2;
86 //build tree from users taxonomy file
87 PhyloTree* phylo = new PhyloTree();
90 m->openInputFile(taxFile, in);
92 //read in users taxonomy file and add sequences to tree
95 in >> seqname >> tax; m->gobble(in);
96 phylo->addSeqToTree(seqname, tax);
100 phylo->assignHeirarchyIDs(0);
102 //make sure the cutoff is not greater than maxlevel
103 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
105 //for each node in tree
106 for (int i = 0; i < phylo->getNumNodes(); i++) {
108 //is this node within the cutoff
109 TaxNode taxon = phylo->get(i);
111 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
112 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
113 for (int j = 0; j < taxon.accessions.size(); j++) {
114 seqGroup[taxon.accessions[j]] = numGroups;
// "classify": split an existing distance file; otherwise distances are computed per group from the fasta file
123 if (method == "classify") {
124 splitDistanceFileByTax(seqGroup, numGroups);
126 createDistanceFilesFromTax(seqGroup, numGroups);
132 catch(exception& e) {
133 m->errorOut(e, "SplitMatrix", "splitClassify");
137 /***********************************************************************/
// Builds one distance file per taxonomy group, starting from the fasta file.
//   seqGroup  - sequence name -> group id
//   numGroups - number of group ids in use (ids run 0..numGroups-1)
// Steps visible in this extract:
//   1. write each grouped sequence from fastafile to "<fastafile>.<group>.temp";
//   2. report names in seqGroup that never appeared in fastafile;
//   3. construct a DistanceCommand for every per-group fasta file (the calls that run
//      and release the command are not visible here -- TODO confirm);
//   4. split namefile into "<namefile>.<group>.temp" files, with unassigned names going
//      to "<namefile>.extra.temp" (stored in `singleton`);
//   5. register every non-empty distance file, paired with its name file, in dists.
// Exceptions are reported through m->errorOut.
138 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// copyGroups tracks which grouped names have not yet been seen in the fasta file
140 map<string, int> copyGroups = seqGroup;
141 map<string, int>::iterator it;
144 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
145 remove((fastafile + "." + toString(i) + ".temp").c_str());
149 m->openInputFile(fastafile, in);
154 Sequence query(in); m->gobble(in);
155 if (query.getName() != "") {
157 it = seqGroup.find(query.getName());
159 //save names in case no namefile is given
160 if (namefile == "") { names.insert(query.getName()); }
162 if (it != seqGroup.end()) { //not singleton
// append the sequence to the temp fasta file of its group
163 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
164 query.printSequence(outFile);
167 copyGroups.erase(query.getName());
173 //warn about sequence in groups that are not in fasta file
174 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
175 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181 //process each distance file
182 for (int i = 0; i < numGroups; i++) {
// option string handed to DistanceCommand: per-group fasta file, processors and cutoff
184 string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(cutoff);
186 Command* command = new DistanceCommand(options);
// the per-group fasta file is no longer needed
190 remove((fastafile + "." + toString(i) + ".temp").c_str());
192 //remove old names files just in case
193 remove((namefile + "." + toString(i) + ".temp").c_str());
// names that end up in no usable group are collected in the "extra" file
196 singleton = namefile + ".extra.temp";
197 ofstream remainingNames;
198 m->openOutputFile(singleton, remainingNames);
200 bool wroteExtra = false;
202 ifstream bigNameFile;
203 m->openInputFile(namefile, bigNameFile);
205 string name, nameList;
206 while(!bigNameFile.eof()){
207 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
209 //did this sequence get assigned a group
210 it = seqGroup.find(name);
212 if (it != seqGroup.end()) {
213 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
214 outFile << name << '\t' << nameList << endl;
// presumably the else branch: name was not assigned to a group -- TODO confirm
218 remainingNames << name << '\t' << nameList << endl;
223 for(int i=0;i<numGroups;i++){
224 string tempNameFile = namefile + "." + toString(i) + ".temp";
225 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
// expected name of the distance file for this group's temp fasta file
226 string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
228 //if there are valid distances
230 fileHandle.open(tempDistFile.c_str());
232 m->gobble(fileHandle);
233 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
234 map<string, string> temp;
235 temp[tempDistFile] = tempNameFile;
236 dists.push_back(temp);
// presumably the blank-file branch: move this group's names into the extra file and drop its name file -- TODO confirm
239 m->openInputFile(tempNameFile, in);
242 in >> name >> nameList; m->gobble(in);
244 remainingNames << name << '\t' << nameList << endl;
247 remove(tempNameFile.c_str());
253 remainingNames.close();
// presumably only executed when nothing was written to the extra file (see wroteExtra) -- TODO confirm
255 remove(singleton.c_str());
// on user interrupt, delete every registered distance/name file pair and forget them
259 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
263 catch(exception& e) {
264 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
268 /***********************************************************************/
// Splits an existing distance file into one file per taxonomy group.
//   seqGroup  - sequence name -> group id
//   numGroups - number of group ids in use (ids run 0..numGroups-1)
// A distance is kept only when both sequences are in seqGroup with the same group id;
// it is then written to "<distFile>.<group>.temp". The name file is split the same way
// into "<namefile>.<group>.temp", and names without a usable group are written to
// "<namefile>.extra.temp" (stored in `singleton`). Groups that received at least one
// distance are registered in dists as {distance file -> name file}.
// Exceptions are reported through m->errorOut.
269 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
271 map<string, int>::iterator it;
272 map<string, int>::iterator it2;
275 m->openInputFile(distFile, dFile);
278 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
279 remove((distFile + "." + toString(i) + ".temp").c_str());
282 //for buffering the io to improve speed
283 //allow for roughly 30 dists to be buffered per group, then output (see the > 30 check below).
284 vector<string> outputs; outputs.resize(numGroups, "");
285 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
287 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
288 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
289 vector<bool> validDistances; validDistances.resize(numGroups, false);
// on user interrupt, close the input and delete the per-group temp distance files
296 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
298 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
300 //if both sequences are in the same group then they are within the cutoff
301 it = seqGroup.find(seqA);
302 it2 = seqGroup.find(seqB);
304 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
305 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer is full: flush it together with the current distance
306 if (numOutputs[it->second] > 30) {
307 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
308 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
310 outputs[it->second] = "";
311 numOutputs[it->second] = 0;
312 validDistances[it->second] = true;
// presumably the else branch: keep buffering this group's distances -- TODO confirm
314 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
315 numOutputs[it->second]++;
322 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
323 remove((namefile + "." + toString(i) + ".temp").c_str());
325 //write out any remaining buffers
326 if (numOutputs[i] > 0) {
327 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
328 outFile << outputs[i];
332 validDistances[i] = true;
336 ifstream bigNameFile;
337 m->openInputFile(namefile, bigNameFile);
// names that end up in no usable group are collected in the "extra" file
339 singleton = namefile + ".extra.temp";
340 ofstream remainingNames;
341 m->openOutputFile(singleton, remainingNames);
343 bool wroteExtra = false;
345 string name, nameList;
346 while(!bigNameFile.eof()){
347 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
349 //did this sequence get assigned a group
350 it = seqGroup.find(name);
352 if (it != seqGroup.end()) {
353 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
354 outFile << name << '\t' << nameList << endl;
// presumably the else branch: name was not assigned to a group -- TODO confirm
358 remainingNames << name << '\t' << nameList << endl;
363 for(int i=0;i<numGroups;i++){
364 string tempNameFile = namefile + "." + toString(i) + ".temp";
365 string tempDistFile = distFile + "." + toString(i) + ".temp";
367 //if there are valid distances
368 if (validDistances[i]) {
369 map<string, string> temp;
370 temp[tempDistFile] = tempNameFile;
371 dists.push_back(temp);
// presumably the no-distances branch: move this group's names into the extra file and drop its name file -- TODO confirm
374 m->openInputFile(tempNameFile, in);
377 in >> name >> nameList; m->gobble(in);
379 remainingNames << name << '\t' << nameList << endl;
382 remove(tempNameFile.c_str());
386 remainingNames.close();
// presumably only executed when nothing was written to the extra file (see wroteExtra) -- TODO confirm
389 remove(singleton.c_str());
// on user interrupt, delete every registered distance/name file pair
393 if (m->control_pressed) {
394 for (int i = 0; i < dists.size(); i++) {
395 remove((dists[i].begin()->first).c_str());
396 remove((dists[i].begin()->second).c_str());
403 catch(exception& e) {
404 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
408 /***********************************************************************/
// Splits distFile into groups of sequences linked by the distances read from it,
// keeping only a small per-group buffer in memory and spilling to
// "<distFile>.<group>.temp" files.
// For each (seqA, seqB, dist) record read:
//   - the existing groups are searched for seqA and seqB;
//   - if neither is found, a new group (with its own buffer) is created;
//   - if only one is found, the other sequence joins that group;
//   - if they are found in two different groups, the groups are merged into the one
//     with the lower id, including buffered text and any temp file already written.
// Remaining buffers are appended to their temp files at the end.
// NOTE(review): the cutoff test, the numGroups bookkeeping and the call that splits
// the name file are not visible in this extract -- confirm against the full source.
// Exceptions are reported through m->errorOut.
409 int SplitMatrix::splitDistanceLarge(){
// groups[i] holds the sequence names of group i; an emptied set marks a merged-away group
411 vector<set<string> > groups;
413 //for buffering the io to improve speed
414 //allow for roughly 60 dists to be buffered per group, then output (see the > 60 check below).
415 vector<string> outputs;
416 vector<int> numOutputs;
// wroteOutPut[i] is set once group i has data in its temp file
417 vector<bool> wroteOutPut;
423 m->openInputFile(distFile, dFile);
429 dFile >> seqA >> seqB >> dist;
// on user interrupt, close the input, delete the temp files of non-empty groups and stop
431 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
434 //cout << "in cutoff: " << dist << endl;
439 for(int i=0;i<numGroups;i++){
440 set<string>::iterator aIt = groups[i].find(seqA);
441 set<string>::iterator bIt = groups[i].find(seqB);
443 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
444 groups[i].insert(seqB);
448 //cout << "in aIt: " << groupID << endl;
451 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
452 groups[i].insert(seqA);
456 // cout << "in bIt: " << groupID << endl;
460 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
461 if(groupIDA < groupIDB){
462 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
463 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
464 groups[groupIDB].clear();
468 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
469 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
470 groups[groupIDA].clear();
477 //windows is gonna gag on the reuse of outFile, will need to make it local...
479 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
480 set<string> newGroup;
481 newGroup.insert(seqA);
482 newGroup.insert(seqB);
483 groups.push_back(newGroup);
// start the new group's buffer with this first distance
485 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
486 outputs.push_back(tempOut);
487 numOutputs.push_back(1);
488 wroteOutPut.push_back(false);
493 string fileName = distFile + "." + toString(groupID) + ".temp";
495 //have we reached the max buffer size
496 if (numOutputs[groupID] > 60) { //write out sequence
497 outFile.open(fileName.c_str(), ios::app);
498 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
501 outputs[groupID] = "";
502 numOutputs[groupID] = 0;
503 wroteOutPut[groupID] = true;
// presumably the else branch: keep buffering -- TODO confirm
505 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
506 numOutputs[groupID]++;
509 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
510 string row, column, distance;
511 if(groupIDA<groupIDB){
// fold group B's buffered distances into the surviving group's buffer
514 numOutputs[groupID] += numOutputs[groupIDB];
515 outputs[groupID] += outputs[groupIDB];
517 outputs[groupIDB] = "";
518 numOutputs[groupIDB] = 0;
520 //if groupB is written to file it is above buffer size so read and write to new merged file
521 if (wroteOutPut[groupIDB]) {
522 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened at the end so tellg() below yields the file size
523 ifstream fileB(fileName2.c_str(), ios::ate);
525 outFile.open(fileName.c_str(), ios::app);
530 size = fileB.tellg();
532 fileB.seekg (0, ios::beg);
// copy the file in 1024-byte chunks plus one final partial chunk
534 int numRead = size / 1024;
535 int lastRead = size % 1024;
537 for (int i = 0; i < numRead; i++) {
539 memblock = new char [1024];
541 fileB.read (memblock, 1024);
// NOTE(review): `string temp = memblock` scans for a '\0', but no terminator is written in the
// lines visible here, so the conversion may read past the buffer; substr() only trims the result
// afterwards. Writing memblock with an explicit length would avoid this -- confirm against the full source.
543 string temp = memblock;
544 outFile << temp.substr(0, 1024);
549 memblock = new char [lastRead];
551 fileB.read (memblock, lastRead);
// NOTE(review): likely explanation for the remark below -- the buffer has no visible '\0' terminator,
// so the string conversion can pick up bytes beyond lastRead.
553 //not sure why but it will read more than lastRead char...??
554 string temp = memblock;
555 outFile << temp.substr(0, lastRead);
559 remove(fileName2.c_str());
561 //write out the merged memory
562 if (numOutputs[groupID] > 60) {
563 outFile << outputs[groupID];
564 outputs[groupID] = "";
565 numOutputs[groupID] = 0;
570 wroteOutPut[groupID] = true;
571 wroteOutPut[groupIDB] = false;
572 }else{ } //just merge b's memory with a's memory
// mirror case: group A is folded into the surviving group
575 numOutputs[groupID] += numOutputs[groupIDA];
576 outputs[groupID] += outputs[groupIDA];
578 outputs[groupIDA] = "";
579 numOutputs[groupIDA] = 0;
581 if (wroteOutPut[groupIDA]) {
582 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened at the end so tellg() below yields the file size
583 ifstream fileB(fileName2.c_str(), ios::ate);
585 outFile.open(fileName.c_str(), ios::app);
590 size = fileB.tellg();
592 fileB.seekg (0, ios::beg);
594 int numRead = size / 1024;
595 int lastRead = size % 1024;
597 for (int i = 0; i < numRead; i++) {
599 memblock = new char [1024];
601 fileB.read (memblock, 1024);
// NOTE(review): same unterminated-buffer concern as in the groupIDB branch above
602 string temp = memblock;
603 outFile << temp.substr(0, 1024);
608 memblock = new char [lastRead];
610 fileB.read (memblock, lastRead);
612 //not sure why but it will read more than lastRead char...??
613 string temp = memblock;
614 outFile << temp.substr(0, lastRead);
619 remove(fileName2.c_str());
621 //write out the merged memory
622 if (numOutputs[groupID] > 60) {
623 outFile << outputs[groupID];
624 outputs[groupID] = "";
625 numOutputs[groupID] = 0;
630 wroteOutPut[groupID] = true;
631 wroteOutPut[groupIDA] = false;
632 }else { } //just merge memory
// flush whatever is still buffered for each group
641 for (int i = 0; i < numGroups; i++) {
642 if (numOutputs[i] > 0) {
643 string fileName = distFile + "." + toString(i) + ".temp";
644 outFile.open(fileName.c_str(), ios::app);
645 outFile << outputs[i];
654 catch(exception& e) {
655 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
659 //********************************************************************************************************************
// Splits the name file so that each non-empty group gets its own
// "<namefile>.<group>.temp" file, then registers every non-empty group in dists as
// {"<distFile>.<group>.temp" -> "<namefile>.<group>.temp"}.
//   groups - groups[i] is the set of sequence names belonging to group i
// Names still left in nameMap at the end are written to "<namefile>.extra.temp"
// (stored in `singleton`); when none are left, singleton is set to "none".
// A group member that is missing from the name file is reported and the program exits.
// Exceptions are reported through m->errorOut.
660 int SplitMatrix::splitNames(vector<set<string> >& groups){
662 int numGroups = groups.size();
664 ifstream bigNameFile(namefile.c_str());
666 cerr << "Error: We can't open the name file\n";
// load the whole name file: first column -> second column
670 map<string, string> nameMap;
671 string name, nameList;
673 bigNameFile >> name >> nameList;
674 nameMap[name] = nameList;
675 m->gobble(bigNameFile);
679 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
680 int numSeqsInGroup = groups[i].size();
682 if(numSeqsInGroup > 0){
683 string fileName = namefile + "." + toString(i) + ".temp";
684 ofstream smallNameFile(fileName.c_str(), ios::ate);
686 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
687 map<string,string>::iterator nIt = nameMap.find(*gIt);
688 if (nIt != nameMap.end()) {
689 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// name is in the distance file but not in the name file: report it and terminate
692 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
695 smallNameFile.close();
699 //names of singletons
// NOTE(review): this only lists singletons if written names are removed from nameMap in a line not visible here -- confirm
700 if (nameMap.size() != 0) {
701 singleton = namefile + ".extra.temp";
702 ofstream remainingNames(singleton.c_str(), ios::ate);
703 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
704 remainingNames << nIt->first << '\t' << nIt->second << endl;
706 remainingNames.close();
707 }else { singleton = "none"; }
// register the distance/name file pair of every non-empty group
709 for(int i=0;i<numGroups;i++){
710 if(groups[i].size() > 0){
711 string tempNameFile = namefile + "." + toString(i) + ".temp";
712 string tempDistFile = distFile + "." + toString(i) + ".temp";
714 map<string, string> temp;
715 temp[tempDistFile] = tempNameFile;
716 dists.push_back(temp);
// on user interrupt, delete every registered distance/name file pair
720 if (m->control_pressed) {
721 for (int i = 0; i < dists.size(); i++) {
722 remove((dists[i].begin()->first).c_str());
723 remove((dists[i].begin()->second).c_str());
730 catch(exception& e) {
731 m->errorOut(e, "SplitMatrix", "splitNames");
735 //********************************************************************************************************************
// In-memory variant of the distance split: groups sequences linked by the distances
// read from distFile and accumulates each group's records in a string, writing
// "<distFile>.<group>.temp" only once at the end.
// Grouping rules match those visible in splitDistanceLarge: a new group when neither
// sequence is known, join when one is known, merge into the lower id when both are
// known in different groups.
// NOTE(review): the cutoff test, the numGroups bookkeeping and the call that splits
// the name file are not visible in this extract -- confirm against the full source.
// Exceptions are reported through m->errorOut.
736 int SplitMatrix::splitDistanceRAM(){
// groups[i] holds the names in group i; outputs[i] holds that group's distance records as text
738 vector<set<string> > groups;
739 vector<string> outputs;
744 m->openInputFile(distFile, dFile);
750 dFile >> seqA >> seqB >> dist;
// on user interrupt, close the input, delete the temp files of non-empty groups and stop
752 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
755 //cout << "in cutoff: " << dist << endl;
760 for(int i=0;i<numGroups;i++){
761 set<string>::iterator aIt = groups[i].find(seqA);
762 set<string>::iterator bIt = groups[i].find(seqB);
764 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
765 groups[i].insert(seqB);
769 //cout << "in aIt: " << groupID << endl;
772 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
773 groups[i].insert(seqA);
777 // cout << "in bIt: " << groupID << endl;
781 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
782 if(groupIDA < groupIDB){
783 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
784 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
785 groups[groupIDB].clear();
789 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
790 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
791 groups[groupIDA].clear();
798 //windows is gonna gag on the reuse of outFile, will need to make it local...
800 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
801 set<string> newGroup;
802 newGroup.insert(seqA);
803 newGroup.insert(seqB);
804 groups.push_back(newGroup);
// start the new group's text buffer with this first distance
806 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
807 outputs.push_back(tempOut);
// presumably the else branch: append the distance to the existing group's buffer -- TODO confirm
812 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
814 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
815 string row, column, distance;
816 if(groupIDA<groupIDB){
// move the absorbed group's text into the surviving group's buffer
818 outputs[groupID] += outputs[groupIDB];
819 outputs[groupIDB] = "";
821 outputs[groupID] += outputs[groupIDA];
822 outputs[groupIDA] = "";
// write every non-empty buffer to its per-group temp distance file
831 for (int i = 0; i < numGroups; i++) {
832 if (outputs[i] != "") {
834 string fileName = distFile + "." + toString(i) + ".temp";
835 outFile.open(fileName.c_str(), ios::ate);
836 outFile << outputs[i];
845 catch(exception& e) {
846 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
850 //********************************************************************************************************************
851 //sorts biggest to smallest
// Ordering predicate: returns true when the file named by the first key of `left`
// is larger (in bytes) than the file named by the first key of `right`.
// Sizes are obtained by seeking to the end of each file and calling ftell.
// Both maps are taken by value, so each call copies them.
// NOTE(review): when fopen fails the error is printed with perror; the guard around
// the following fseek/ftell and the fclose calls are not visible in this extract --
// confirm against the full source.
852 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
857 //get num bytes in file
858 string filename = left.begin()->first;
859 pFile = fopen (filename.c_str(),"rb");
860 string error = "Error opening " + filename;
861 if (pFile==NULL) perror (error.c_str());
863 fseek (pFile, 0, SEEK_END);
864 leftsize=ftell (pFile);
871 //get num bytes in file
872 filename = right.begin()->first;
873 pFile2 = fopen (filename.c_str(),"rb");
874 error = "Error opening " + filename;
875 if (pFile2==NULL) perror (error.c_str());
877 fseek (pFile2, 0, SEEK_END);
878 rightsize=ftell (pFile2);
// strictly greater: larger files order first
882 return (leftsize > rightsize);
884 /***********************************************************************/
885 //returns map of distance files -> namefile sorted by distance file size
// dists is sorted in place with compareFileSizes, i.e. largest distance file first.
// The return statement is not visible in this extract (presumably it returns dists --
// TODO confirm). Exceptions are reported through m->errorOut.
886 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
889 sort(dists.begin(), dists.end(), compareFileSizes);
893 catch(exception& e) {
894 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
898 /***********************************************************************/
// Destructor: empty body, performs no explicit cleanup.
899 SplitMatrix::~SplitMatrix(){}
900 /***********************************************************************/