5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
14 /***********************************************************************/
// Constructor taking a distance file name (distfile) together with name and taxonomy
// inputs, a cutoff (c), a string (t) and a flag (l).
// Stores the MothurOut instance returned by getInstance() in `m`; the rest of the class
// uses `m` for logging and file helpers.
// NOTE(review): the statements that copy the arguments into members are not visible
// here -- presumably distFile/namefile/taxFile/cutoff/method/large; confirm.
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17 m = MothurOut::getInstance();
25 /***********************************************************************/
// Constructor taking a fasta file name (ffile) together with name and taxonomy inputs.
//   c  - taxonomy level cutoff, stored in `cutoff`
//   cu - distance cutoff, stored in `distCutoff`
//   t, p, output - NOTE(review): their assignments are not visible here; presumably
//                  method, processors and outputDir -- confirm.
// Stores the MothurOut instance returned by getInstance() in `m`.
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
28 m = MothurOut::getInstance();
32 cutoff = c; //tax level cutoff
33 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
39 /***********************************************************************/
// Entry point for splitting; branches on the `method` member.
//   "distance"            -> first branch
//   "classify" or "fasta" -> second branch
// The remaining visible path logs "Unknown splitting method" and pushes a single
// distFile -> namefile entry onto `dists`.
// Exceptions are reported through m->errorOut.
// NOTE(review): the calls made inside the first two branches are not visible here --
// presumably splitDistance() and splitClassify(); confirm.
41 int SplitMatrix::split(){
44 if (method == "distance") {
46 }else if ((method == "classify") || (method == "fasta")) {
49 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
// record the unsplit pair: whole distance file mapped to whole name file
50 map<string, string> temp;
51 temp[distFile] = namefile;
52 dists.push_back(temp);
58 m->errorOut(e, "SplitMatrix", "split");
62 /***********************************************************************/
// Splits the distance file by distance, choosing the implementation from the `large`
// member: splitDistanceLarge() when it is set, splitDistanceRAM() otherwise.
// Exceptions are reported through m->errorOut.
63 int SplitMatrix::splitDistance(){
66 if (large) { splitDistanceLarge(); }
67 else { splitDistanceRAM(); }
71 m->errorOut(e, "SplitMatrix", "splitDistance");
76 /***********************************************************************/
// Groups sequences by taxonomy and then hands the grouping to the file splitter.
// Steps visible here:
//   1. read (seqname, tax) pairs from taxFile into a PhyloTree
//   2. clamp `cutoff` to the tree's maximum level
//   3. for every node whose level equals `cutoff` and that holds more than one
//      accession, map each accession to the current group number in seqGroup
//   4. call splitDistanceFileByTax when method == "classify", and
//      createDistanceFilesFromTax on the other visible path
// Exceptions are reported through m->errorOut.
// NOTE(review): the declaration/increment of numGroups and the release of `phylo`
// are not visible here -- confirm that numGroups advances once per group and that
// the PhyloTree allocated with new is deleted.
77 int SplitMatrix::splitClassify(){
81 map<string, int> seqGroup;
82 map<string, int>::iterator it;
83 map<string, int>::iterator it2;
87 //build tree from users taxonomy file
88 PhyloTree* phylo = new PhyloTree();
91 m->openInputFile(taxFile, in);
93 //read in users taxonomy file and add sequences to tree
96 in >> seqname >> tax; m->gobble(in);
97 phylo->addSeqToTree(seqname, tax);
101 phylo->assignHeirarchyIDs(0);
103 //make sure the cutoff is not greater than maxlevel
104 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
106 //for each node in tree
107 for (int i = 0; i < phylo->getNumNodes(); i++) {
109 //is this node within the cutoff
110 TaxNode taxon = phylo->get(i);
112 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
113 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
114 for (int j = 0; j < taxon.accessions.size(); j++) {
115 seqGroup[taxon.accessions[j]] = numGroups;
// "classify" splits an existing distance file; the other path builds the distance
// files from the fasta file
124 if (method == "classify") {
125 splitDistanceFileByTax(seqGroup, numGroups);
127 createDistanceFilesFromTax(seqGroup, numGroups);
133 catch(exception& e) {
134 m->errorOut(e, "SplitMatrix", "splitClassify");
138 /***********************************************************************/
// Builds one distance file per taxonomy group, starting from the fasta file.
//   seqGroup  - sequence name -> group index
//   numGroups - number of groups; temp files are numbered 0..numGroups-1
// Steps visible here:
//   1. split fastafile into "<fastafile>.<group>.temp" files
//   2. report sequences that are in seqGroup but never seen in the fasta file
//   3. create a DistanceCommand for each temp fasta file, then remove that temp file
//   4. split the name file into "<namefile>.<group>.temp", sending unassigned names
//      to "<namefile>.extra.temp" (stored in `singleton`)
//   5. register each group whose distance file is not blank in `dists`; the names of
//      the other groups are written to the extra file instead
//   6. if control_pressed is set, remove every registered file and clear `dists`
// Exceptions are reported through m->errorOut.
// NOTE(review): stream declarations, close() calls and several else branches are not
// visible here; the notes below mark the places that depend on them.
139 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// working copy: entries are erased as sequences are found in the fasta file
141 map<string, int> copyGroups = seqGroup;
142 map<string, int>::iterator it;
145 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
146 remove((fastafile + "." + toString(i) + ".temp").c_str());
150 m->openInputFile(fastafile, in);
155 Sequence query(in); m->gobble(in);
156 if (query.getName() != "") {
158 it = seqGroup.find(query.getName());
160 //save names in case no namefile is given
161 if (namefile == "") { names.insert(query.getName()); }
163 if (it != seqGroup.end()) { //not singleton
164 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
165 query.printSequence(outFile);
168 copyGroups.erase(query.getName());
174 //warn about sequence in groups that are not in fasta file
175 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
176 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
182 //process each distance file
183 for (int i = 0; i < numGroups; i++) {
// option string handed to DistanceCommand: temp fasta file, processor count, distance cutoff
185 string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
// NOTE(review): the execute/delete calls for this command are not visible here -- confirm it is released
187 Command* command = new DistanceCommand(options);
191 remove((fastafile + "." + toString(i) + ".temp").c_str());
193 //remove old names files just in case
194 remove((namefile + "." + toString(i) + ".temp").c_str());
// names that are not assigned to any group are collected in this extra file
197 singleton = namefile + ".extra.temp";
198 ofstream remainingNames;
199 m->openOutputFile(singleton, remainingNames);
201 bool wroteExtra = false;
203 ifstream bigNameFile;
204 m->openInputFile(namefile, bigNameFile);
206 string name, nameList;
// NOTE(review): looping on !eof() relies on gobble() consuming trailing whitespace so
// that eof is reached after the last record -- confirm gobble() does that
207 while(!bigNameFile.eof()){
208 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
210 //did this sequence get assigned a group
211 it = seqGroup.find(name);
213 if (it != seqGroup.end()) {
214 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
215 outFile << name << '\t' << nameList << endl;
219 remainingNames << name << '\t' << nameList << endl;
224 for(int i=0;i<numGroups;i++){
225 string tempNameFile = namefile + "." + toString(i) + ".temp";
// default the output directory from the fasta file when none was given
226 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
// presumably the name DistanceCommand gives its output for this temp fasta file -- TODO confirm
227 string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
229 //if there are valid distances
231 fileHandle.open(tempDistFile.c_str());
233 m->gobble(fileHandle);
234 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
235 map<string, string> temp;
236 temp[tempDistFile] = tempNameFile;
237 dists.push_back(temp);
// other visible path: copy this group's names into the extra file and drop its temp name file
240 m->openInputFile(tempNameFile, in);
243 in >> name >> nameList; m->gobble(in);
245 remainingNames << name << '\t' << nameList << endl;
248 remove(tempNameFile.c_str());
254 remainingNames.close();
// NOTE(review): the condition guarding this remove is not visible -- presumably it
// only runs when nothing was written (wroteExtra is false); confirm
256 remove(singleton.c_str());
// user interrupt: delete every registered distance/name file pair and forget them
260 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
264 catch(exception& e) {
265 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
269 /***********************************************************************/
// Splits an existing distance file into one file per taxonomy group.
//   seqGroup  - sequence name -> group index
//   numGroups - number of groups; temp files are numbered 0..numGroups-1
// A distance is kept only when both of its sequences are in seqGroup and share the
// same group. Kept distances are buffered per group in `outputs` and appended to
// "<distFile>.<group>.temp". The name file is split the same way, and names without
// a group go to "<namefile>.extra.temp" (stored in `singleton`). Groups that
// produced at least one distance are registered in `dists`.
// Exceptions are reported through m->errorOut.
// NOTE(review): stream declarations, close() calls and several else branches are not
// visible here; the notes below mark the places that depend on them.
270 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
272 map<string, int>::iterator it;
273 map<string, int>::iterator it2;
276 m->openInputFile(distFile, dFile);
279 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
280 remove((distFile + "." + toString(i) + ".temp").c_str());
283 //for buffering the io to improve speed
284 //buffer the distances of each group and write them out once more than 30 are pending (see the > 30 check below).
285 vector<string> outputs; outputs.resize(numGroups, "");
286 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
288 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
289 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
290 vector<bool> validDistances; validDistances.resize(numGroups, false);
// NOTE(review): no return is visible on this line, unlike the control_pressed checks
// in splitDistanceLarge and splitDistanceRAM; the code after it keeps using dFile
// after close() -- confirm this is intended
297 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
299 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
301 //if both sequences are in the same group then they are within the cutoff
302 it = seqGroup.find(seqA);
303 it2 = seqGroup.find(seqB);
305 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
306 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer is full: append the buffered text plus the current distance to the group's file
307 if (numOutputs[it->second] > 30) {
308 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
309 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
311 outputs[it->second] = "";
312 numOutputs[it->second] = 0;
313 validDistances[it->second] = true;
// other visible path: keep the distance in the group's in-memory buffer
315 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
316 numOutputs[it->second]++;
323 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
324 remove((namefile + "." + toString(i) + ".temp").c_str());
326 //write out any remaining buffers
327 if (numOutputs[i] > 0) {
328 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
329 outFile << outputs[i];
333 validDistances[i] = true;
337 ifstream bigNameFile;
338 m->openInputFile(namefile, bigNameFile);
// names that are not assigned to any group are collected in this extra file
340 singleton = namefile + ".extra.temp";
341 ofstream remainingNames;
342 m->openOutputFile(singleton, remainingNames);
344 bool wroteExtra = false;
346 string name, nameList;
// NOTE(review): looping on !eof() relies on gobble() consuming trailing whitespace so
// that eof is reached after the last record -- confirm gobble() does that
347 while(!bigNameFile.eof()){
348 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
350 //did this sequence get assigned a group
351 it = seqGroup.find(name);
353 if (it != seqGroup.end()) {
354 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
355 outFile << name << '\t' << nameList << endl;
359 remainingNames << name << '\t' << nameList << endl;
364 for(int i=0;i<numGroups;i++){
365 string tempNameFile = namefile + "." + toString(i) + ".temp";
366 string tempDistFile = distFile + "." + toString(i) + ".temp";
368 //if there are valid distances
369 if (validDistances[i]) {
370 map<string, string> temp;
371 temp[tempDistFile] = tempNameFile;
372 dists.push_back(temp);
// other visible path: copy this group's names into the extra file and drop its temp name file
375 m->openInputFile(tempNameFile, in);
378 in >> name >> nameList; m->gobble(in);
380 remainingNames << name << '\t' << nameList << endl;
383 remove(tempNameFile.c_str());
387 remainingNames.close();
// NOTE(review): the condition guarding this remove is not visible -- presumably it
// only runs when nothing was written (wroteExtra is false); confirm
390 remove(singleton.c_str());
// user interrupt: delete every registered distance/name file pair
394 if (m->control_pressed) {
395 for (int i = 0; i < dists.size(); i++) {
396 remove((dists[i].begin()->first).c_str());
397 remove((dists[i].begin()->second).c_str());
404 catch(exception& e) {
405 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
409 /***********************************************************************/
// Splits the distance file into groups of connected sequences while keeping only a
// bounded amount of distance text in memory.
// For each (seqA, seqB, dist) record read from distFile:
//   - the existing groups are searched for seqA and seqB
//   - if neither is found, a new group is created with its own buffer
//   - if both are found in different groups, the two groups are merged, along with
//     their buffers and any temp file already written for the emptied group
//   - the distance is added to the group's buffer in `outputs`, which is appended to
//     "<distFile>.<group>.temp" once more than 60 entries are pending
// Buffers that are still non-empty are appended to their files at the end.
// If control_pressed is set while reading, the temp files of non-empty groups are
// removed and the function returns 0.
// Exceptions are reported through m->errorOut.
// NOTE(review): the cutoff test, the assignments of groupID/groupIDA/groupIDB/numGroups,
// the else branches and the stream close/delete[] calls are not visible here; the
// notes below mark the places that depend on them.
410 int SplitMatrix::splitDistanceLarge(){
// groups[i] holds the sequence names that belong to group i
412 vector<set<string> > groups;
414 //for buffering the io to improve speed
415 //buffer the distances of each group and write them out once more than 60 are pending (see the > 60 checks below).
416 vector<string> outputs;
417 vector<int> numOutputs;
// wroteOutPut[i] is true once group i has text in its temp file
418 vector<bool> wroteOutPut;
424 m->openInputFile(distFile, dFile);
430 dFile >> seqA >> seqB >> dist;
432 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
435 //cout << "in cutoff: " << dist << endl;
440 for(int i=0;i<numGroups;i++){
441 set<string>::iterator aIt = groups[i].find(seqA);
442 set<string>::iterator bIt = groups[i].find(seqB);
444 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
445 groups[i].insert(seqB);
449 //cout << "in aIt: " << groupID << endl;
452 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
453 groups[i].insert(seqA);
457 // cout << "in bIt: " << groupID << endl;
461 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
462 if(groupIDA < groupIDB){
463 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
464 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
465 groups[groupIDB].clear();
469 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
470 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
471 groups[groupIDA].clear();
478 //windows is gonna gag on the reuse of outFile, will need to make it local...
480 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
481 set<string> newGroup;
482 newGroup.insert(seqA);
483 newGroup.insert(seqB);
484 groups.push_back(newGroup);
// the new group's buffer starts with this distance; nothing is on disk for it yet
486 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
487 outputs.push_back(tempOut);
488 numOutputs.push_back(1);
489 wroteOutPut.push_back(false);
// existing group: its temp file is named after groupID
494 string fileName = distFile + "." + toString(groupID) + ".temp";
496 //have we reached the max buffer size
497 if (numOutputs[groupID] > 60) { //write out sequence
498 outFile.open(fileName.c_str(), ios::app);
499 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
502 outputs[groupID] = "";
503 numOutputs[groupID] = 0;
504 wroteOutPut[groupID] = true;
// other visible path: keep the distance in the group's in-memory buffer
506 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
507 numOutputs[groupID]++;
510 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
511 string row, column, distance;
512 if(groupIDA<groupIDB){
// fold group B's buffered text and count into the surviving group, then empty B's buffer
515 numOutputs[groupID] += numOutputs[groupIDB];
516 outputs[groupID] += outputs[groupIDB];
518 outputs[groupIDB] = "";
519 numOutputs[groupIDB] = 0;
521 //if groupB is written to file it is above buffer size so read and write to new merged file
522 if (wroteOutPut[groupIDB]) {
523 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened at the end so that tellg() reports the file size
524 ifstream fileB(fileName2.c_str(), ios::ate);
526 outFile.open(fileName.c_str(), ios::app);
531 size = fileB.tellg();
533 fileB.seekg (0, ios::beg);
// the file is copied in 1024-byte pieces followed by one shorter final piece
535 int numRead = size / 1024;
536 int lastRead = size % 1024;
538 for (int i = 0; i < numRead; i++) {
540 memblock = new char [1024];
542 fileB.read (memblock, 1024);
// NOTE(review): read() does not NUL-terminate memblock, so building a string from it
// can run past the 1024 bytes; substr() trims the result afterwards. The matching
// delete[] is not visible here -- confirm the buffer is released.
544 string temp = memblock;
545 outFile << temp.substr(0, 1024);
550 memblock = new char [lastRead];
552 fileB.read (memblock, lastRead);
554 //not sure why but it will read more than lastRead char...??
// NOTE(review): likely because memblock has no terminating NUL after read(), so the
// string constructor keeps scanning past lastRead; outFile.write(memblock, fileB.gcount())
// would avoid the temporary string.
555 string temp = memblock;
556 outFile << temp.substr(0, lastRead);
560 remove(fileName2.c_str());
562 //write out the merged memory
563 if (numOutputs[groupID] > 60) {
564 outFile << outputs[groupID];
565 outputs[groupID] = "";
566 numOutputs[groupID] = 0;
571 wroteOutPut[groupID] = true;
572 wroteOutPut[groupIDB] = false;
573 }else{ } //just merge b's memory with a's memory
// mirror case: group A is the one that was emptied, so its buffer and file are folded in
576 numOutputs[groupID] += numOutputs[groupIDA];
577 outputs[groupID] += outputs[groupIDA];
579 outputs[groupIDA] = "";
580 numOutputs[groupIDA] = 0;
582 if (wroteOutPut[groupIDA]) {
583 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened at the end so that tellg() reports the file size
584 ifstream fileB(fileName2.c_str(), ios::ate);
586 outFile.open(fileName.c_str(), ios::app);
591 size = fileB.tellg();
593 fileB.seekg (0, ios::beg);
595 int numRead = size / 1024;
596 int lastRead = size % 1024;
598 for (int i = 0; i < numRead; i++) {
600 memblock = new char [1024];
602 fileB.read (memblock, 1024);
// NOTE(review): same missing-terminator concern as in the group B copy above
603 string temp = memblock;
604 outFile << temp.substr(0, 1024);
609 memblock = new char [lastRead];
611 fileB.read (memblock, lastRead);
613 //not sure why but it will read more than lastRead char...??
614 string temp = memblock;
615 outFile << temp.substr(0, lastRead);
620 remove(fileName2.c_str());
622 //write out the merged memory
623 if (numOutputs[groupID] > 60) {
624 outFile << outputs[groupID];
625 outputs[groupID] = "";
626 numOutputs[groupID] = 0;
631 wroteOutPut[groupID] = true;
632 wroteOutPut[groupIDA] = false;
633 }else { } //just merge memory
// final pass: append whatever is still buffered for each group
642 for (int i = 0; i < numGroups; i++) {
643 if (numOutputs[i] > 0) {
644 string fileName = distFile + "." + toString(i) + ".temp";
645 outFile.open(fileName.c_str(), ios::app);
646 outFile << outputs[i];
655 catch(exception& e) {
656 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
660 //********************************************************************************************************************
// Splits the name file so that each non-empty group gets a matching name file.
//   groups - groups[i] holds the sequence names of group i
// Steps visible here:
//   1. read every (name, nameList) record of namefile into nameMap
//   2. for each non-empty group, write its records to "<namefile>.<i>.temp"; a group
//      member that is missing from nameMap is reported and the process exits with 1
//   3. write the records left in nameMap to "<namefile>.extra.temp", or set
//      `singleton` to "none" when nameMap is empty
//   4. register a distance file -> name file pair in `dists` for each non-empty group
//   5. if control_pressed is set, remove every registered file
// Exceptions are reported through m->errorOut.
// NOTE(review): step 3 only yields the singletons if matched names are erased from
// nameMap in step 2; that erase is not visible here -- confirm.
661 int SplitMatrix::splitNames(vector<set<string> >& groups){
663 int numGroups = groups.size();
665 ifstream bigNameFile(namefile.c_str());
667 cerr << "Error: We can't open the name file\n";
671 map<string, string> nameMap;
672 string name, nameList;
674 bigNameFile >> name >> nameList;
675 nameMap[name] = nameList;
676 m->gobble(bigNameFile);
680 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
681 int numSeqsInGroup = groups[i].size();
683 if(numSeqsInGroup > 0){
684 string fileName = namefile + "." + toString(i) + ".temp";
// NOTE(review): ios::ate on an ofstream does not append -- without ios::app or ios::in
// the file is truncated on open; confirm truncation is what is wanted here
685 ofstream smallNameFile(fileName.c_str(), ios::ate);
687 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
688 map<string,string>::iterator nIt = nameMap.find(*gIt);
689 if (nIt != nameMap.end()) {
690 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// a sequence in the distance file without a name record is treated as fatal
693 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
696 smallNameFile.close();
700 //names of singletons
701 if (nameMap.size() != 0) {
702 singleton = namefile + ".extra.temp";
703 ofstream remainingNames(singleton.c_str(), ios::ate);
704 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
705 remainingNames << nIt->first << '\t' << nIt->second << endl;
707 remainingNames.close();
708 }else { singleton = "none"; }
// register the per-group distance/name file pairs
710 for(int i=0;i<numGroups;i++){
711 if(groups[i].size() > 0){
712 string tempNameFile = namefile + "." + toString(i) + ".temp";
713 string tempDistFile = distFile + "." + toString(i) + ".temp";
715 map<string, string> temp;
716 temp[tempDistFile] = tempNameFile;
717 dists.push_back(temp);
// user interrupt: delete every registered distance/name file pair
721 if (m->control_pressed) {
722 for (int i = 0; i < dists.size(); i++) {
723 remove((dists[i].begin()->first).c_str());
724 remove((dists[i].begin()->second).c_str());
731 catch(exception& e) {
732 m->errorOut(e, "SplitMatrix", "splitNames");
736 //********************************************************************************************************************
// Splits the distance file into groups of connected sequences, keeping all of the
// distance text of every group in memory (`outputs`) until the end.
// For each (seqA, seqB, dist) record read from distFile:
//   - the existing groups are searched for seqA and seqB
//   - if neither is found, a new group is created with its own output string
//   - if both are found in different groups, the two groups and their output strings
//     are merged
//   - the distance is added to the group's output string
// Each non-empty output string is then written to "<distFile>.<group>.temp".
// If control_pressed is set while reading, the temp files of non-empty groups are
// removed and the function returns 0.
// Exceptions are reported through m->errorOut.
// NOTE(review): the cutoff test, the assignments of groupID/groupIDA/groupIDB/numGroups
// and the else branches are not visible here.
737 int SplitMatrix::splitDistanceRAM(){
// groups[i] holds the sequence names of group i; outputs[i] holds its distance text
739 vector<set<string> > groups;
740 vector<string> outputs;
745 m->openInputFile(distFile, dFile);
751 dFile >> seqA >> seqB >> dist;
753 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
756 //cout << "in cutoff: " << dist << endl;
761 for(int i=0;i<numGroups;i++){
762 set<string>::iterator aIt = groups[i].find(seqA);
763 set<string>::iterator bIt = groups[i].find(seqB);
765 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
766 groups[i].insert(seqB);
770 //cout << "in aIt: " << groupID << endl;
773 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
774 groups[i].insert(seqA);
778 // cout << "in bIt: " << groupID << endl;
782 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
783 if(groupIDA < groupIDB){
784 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
785 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
786 groups[groupIDB].clear();
790 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
791 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
792 groups[groupIDA].clear();
799 //windows is gonna gag on the reuse of outFile, will need to make it local...
801 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
802 set<string> newGroup;
803 newGroup.insert(seqA);
804 newGroup.insert(seqB);
805 groups.push_back(newGroup);
// the new group's output string starts with this distance
807 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
808 outputs.push_back(tempOut);
// existing group: append the distance to its output string
813 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
815 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
816 string row, column, distance;
817 if(groupIDA<groupIDB){
// move the emptied group's text into the surviving group's output string
819 outputs[groupID] += outputs[groupIDB];
820 outputs[groupIDB] = "";
822 outputs[groupID] += outputs[groupIDA];
823 outputs[groupIDA] = "";
// final pass: write each non-empty output string to its temp file
832 for (int i = 0; i < numGroups; i++) {
833 if (outputs[i] != "") {
835 string fileName = distFile + "." + toString(i) + ".temp";
// NOTE(review): opened with ios::ate rather than ios::app; if outFile is an ofstream
// this truncates an existing file on open -- confirm that is intended
836 outFile.open(fileName.c_str(), ios::ate);
837 outFile << outputs[i];
846 catch(exception& e) {
847 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
851 //********************************************************************************************************************
852 //sort comparator: orders entries from biggest to smallest distance file
// Each argument maps a distance file name to a name file; only the first key
// (the distance file) is inspected. The size is taken by opening the file in binary
// mode, seeking to the end and reading the position with ftell.
// Returns true when the left file is larger than the right file.
// NOTE(review): both maps are passed by value, so they are copied on every call.
// NOTE(review): the statements between the NULL check and fseek are not visible --
// confirm fseek/ftell are skipped when fopen fails and that the FILE is closed.
853 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
858 //get num bytes in file
859 string filename = left.begin()->first;
860 pFile = fopen (filename.c_str(),"rb");
861 string error = "Error opening " + filename;
862 if (pFile==NULL) perror (error.c_str());
864 fseek (pFile, 0, SEEK_END);
865 leftsize=ftell (pFile);
872 //get num bytes in file
873 filename = right.begin()->first;
874 pFile2 = fopen (filename.c_str(),"rb");
875 error = "Error opening " + filename;
876 if (pFile2==NULL) perror (error.c_str());
878 fseek (pFile2, 0, SEEK_END);
879 rightsize=ftell (pFile2);
883 return (leftsize > rightsize);
885 /***********************************************************************/
886 //returns the distance file -> name file entries, sorted by distance file size
// Sorts the `dists` member in place with compareFileSizes (largest file first).
// Exceptions are reported through m->errorOut.
// NOTE(review): the return statement is not visible here -- presumably returns dists.
887 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
890 sort(dists.begin(), dists.end(), compareFileSizes);
894 catch(exception& e) {
895 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
899 /***********************************************************************/
// Destructor: empty body, performs no explicit cleanup.
900 SplitMatrix::~SplitMatrix(){}
901 /***********************************************************************/