5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
15 /***********************************************************************/
// Constructor overload taking a distance file name (distfile), a name file (name),
// a taxonomy file (tax), a float cutoff (c), a method string (t) and a bool (l).
// Only the MothurOut singleton lookup is visible in this excerpt; the parameters are
// presumably copied into the matching members on lines not shown here - TODO confirm.
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
// cache the shared MothurOut instance used for all logging and file helpers below
18 m = MothurOut::getInstance();
26 /***********************************************************************/
// Constructor overload taking a fasta file name (ffile) instead of a distance file.
// Visible assignments: `cutoff` receives c and `distCutoff` receives cu.
// The remaining parameters (ffile, name, tax, t, p, cl, output) are presumably stored
// on lines not shown in this excerpt - TODO confirm.
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){
// cache the shared MothurOut instance used for all logging and file helpers below
29 m = MothurOut::getInstance();
33 cutoff = c; //tax level cutoff
34 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
41 /***********************************************************************/
// Top-level entry point: chooses how to split based on the `method` member.
// Recognised values are "distance" and "classify"/"fasta"; the statements executed in
// those two branches are on lines not visible in this excerpt.
// For an unrecognised method a message is printed and the unsplit distFile -> namefile
// pair is appended to `dists` (presumably inside the final else branch - TODO confirm).
// Returns int; the return statement itself is not visible here.
43 int SplitMatrix::split(){
46 if (method == "distance") {
48 }else if ((method == "classify") || (method == "fasta")) {
51 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
// record the whole input as one distance-file -> name-file entry
52 map<string, string> temp;
53 temp[distFile] = namefile;
54 dists.push_back(temp);
// report the exception through MothurOut, tagged with class and method name
60 m->errorOut(e, "SplitMatrix", "split");
64 /***********************************************************************/
// Splits by distance, delegating to one of two implementations depending on the
// `large` member: splitDistanceLarge() when it is true, splitDistanceRAM() otherwise.
// Returns int; the return statement is not visible in this excerpt.
65 int SplitMatrix::splitDistance(){
68 if (large) { splitDistanceLarge(); }
69 else { splitDistanceRAM(); }
// report the exception through MothurOut, tagged with class and method name
75 m->errorOut(e, "SplitMatrix", "splitDistance");
80 /***********************************************************************/
// Groups sequences by taxonomy and then hands the grouping to a splitter.
// Steps visible in this excerpt:
//   1. build a PhyloTree from the (seqname, tax) pairs read out of taxFile;
//   2. clamp `cutoff` to the tree's maximum level;
//   3. for every tree node whose level equals `cutoff` and that holds more than one
//      accession, map each accession to the current value of numGroups;
//   4. call splitDistanceFileByTax() when method == "classify", and
//      createDistanceFilesFromTax() otherwise (presumably the else branch - TODO confirm).
// The declaration and increment of numGroups, the read-loop header and any cleanup of
// `phylo` are on lines not shown here.
81 int SplitMatrix::splitClassify(){
// sequence name -> group index
85 map<string, int> seqGroup;
86 map<string, int>::iterator it;
87 map<string, int>::iterator it2;
91 //build tree from users taxonomy file
// NOTE(review): allocated with raw new; the matching delete is not visible in this excerpt - confirm it exists
92 PhyloTree* phylo = new PhyloTree();
95 m->openInputFile(taxFile, in);
97 //read in users taxonomy file and add sequences to tree
100 in >> seqname >> tax; m->gobble(in);
101 phylo->addSeqToTree(seqname, tax);
105 phylo->assignHeirarchyIDs(0);
107 //make sure the cutoff is not greater than maxlevel
108 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
110 //for each node in tree
111 for (int i = 0; i < phylo->getNumNodes(); i++) {
113 //is this node within the cutoff
114 TaxNode taxon = phylo->get(i);
116 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118 for (int j = 0; j < taxon.accessions.size(); j++) {
119 seqGroup[taxon.accessions[j]] = numGroups;
// "classify" splits an existing distance file; the other call builds distance files from the fasta file
128 if (method == "classify") {
129 splitDistanceFileByTax(seqGroup, numGroups);
131 createDistanceFilesFromTax(seqGroup, numGroups);
137 catch(exception& e) {
138 m->errorOut(e, "SplitMatrix", "splitClassify");
142 /***********************************************************************/
// Builds one distance file per taxonomy group, starting from the fasta file.
// Parameters:
//   seqGroup  - sequence name -> group index
//   numGroups - number of groups; temp files are named <file>.<i>.temp for i in [0, numGroups)
// Steps visible in this excerpt:
//   1. split fastafile into per-group temp fasta files;
//   2. warn about grouped sequences that never appeared in the fasta file;
//   3. run a DistanceCommand on each temp fasta file, then delete that temp file;
//   4. split the name file into per-group temp name files, sending unassigned names to
//      the "<namefile>.extra.temp" file whose path is stored in `singleton`;
//   5. for each group, register (distance file -> name file) in `dists` when the
//      distance file is not blank;
//   6. on m->control_pressed, delete everything registered and clear `dists`.
// Several declarations, loop headers, else branches and close() calls are on lines not
// shown here, so the exact nesting of the statements below cannot be confirmed.
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// working copy: entries are erased as sequences are seen, so leftovers are names missing from the fasta file
145 map<string, int> copyGroups = seqGroup;
146 map<string, int>::iterator it;
149 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
154 m->openInputFile(fastafile, in);
159 Sequence query(in); m->gobble(in);
160 if (query.getName() != "") {
162 it = seqGroup.find(query.getName());
164 //save names in case no namefile is given
165 if (namefile == "") { names.insert(query.getName()); }
167 if (it != seqGroup.end()) { //not singleton
// append this sequence to its group's temp fasta file
168 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169 query.printSequence(outFile);
172 copyGroups.erase(query.getName());
178 //warn about sequence in groups that are not in fasta file
179 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
186 //process each distance file
187 for (int i = 0; i < numGroups; i++) {
// classic mode requests output=lt; otherwise the distance cutoff is passed through
190 if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191 else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192 if (outputDir != "") { options += ", outputdir=" + outputDir; }
// NOTE(review): allocated with raw new; the matching delete is not visible in this excerpt - confirm it exists
194 Command* command = new DistanceCommand(options);
199 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
201 //remove old names files just in case
202 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
// names that end up in no usable group are collected in this extra file
205 singleton = namefile + ".extra.temp";
206 ofstream remainingNames;
207 m->openOutputFile(singleton, remainingNames);
209 bool wroteExtra = false;
211 ifstream bigNameFile;
212 m->openInputFile(namefile, bigNameFile);
214 string name, nameList;
215 while(!bigNameFile.eof()){
216 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
218 //did this sequence get assigned a group
219 it = seqGroup.find(name);
221 if (it != seqGroup.end()) {
222 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
223 outFile << name << '\t' << nameList << endl;
// presumably the else branch (name not in any group) - TODO confirm against the hidden lines
227 remainingNames << name << '\t' << nameList << endl;
232 for(int i=0;i<numGroups;i++){
233 string tempNameFile = namefile + "." + toString(i) + ".temp";
// note: this overwrites the outputDir member when it was empty
234 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
235 string tempDistFile = "";
// rebuild the distance file name: "phylip.dist" suffix in classic mode, "dist" otherwise
236 if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
237 else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
239 //if there are valid distances
241 fileHandle.open(tempDistFile.c_str());
243 m->gobble(fileHandle);
244 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
245 map<string, string> temp;
246 temp[tempDistFile] = tempNameFile;
247 dists.push_back(temp);
// presumably the blank-file case: the group's names are copied into the extra file and the
// group's temp name file is deleted - TODO confirm against the hidden lines
250 m->openInputFile(tempNameFile, in);
253 in >> name >> nameList; m->gobble(in);
255 remainingNames << name << '\t' << nameList << endl;
258 m->mothurRemove(tempNameFile);
264 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible here; `wroteExtra` above suggests it is conditional
266 m->mothurRemove(singleton);
// user interrupt: delete every registered distance/name file and forget them
270 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
274 catch(exception& e) {
275 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
279 /***********************************************************************/
// Splits an existing distance file into one temp distance file per taxonomy group.
// Parameters:
//   seqGroup  - sequence name -> group index
//   numGroups - number of groups; temp files are named <file>.<i>.temp for i in [0, numGroups)
// Steps visible in this excerpt:
//   1. read (seqA, seqB, dist) records from distFile; a record is kept only when both
//      sequences are grouped and share the same group;
//   2. kept records are buffered per group in `outputs` and appended to
//      "<distFile>.<group>.temp" once the buffer holds more than 30 records;
//   3. leftover buffers are written out and the group is marked in validDistances;
//   4. the name file is split per group, unassigned names going to the
//      "<namefile>.extra.temp" file whose path is stored in `singleton`;
//   5. groups with valid distances are registered in `dists`;
//   6. on m->control_pressed the registered files are deleted.
// Loop headers, else branches and close() calls are on lines not shown here, so the
// exact nesting of the statements below cannot be confirmed.
280 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
282 map<string, int>::iterator it;
283 map<string, int>::iterator it2;
286 m->openInputFile(distFile, dFile);
289 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
290 m->mothurRemove((distFile + "." + toString(i) + ".temp"));
293 //for buffering the io to improve speed
294 //a group's buffer is written to its temp file once it holds more than 30 dists.
295 vector<string> outputs; outputs.resize(numGroups, "");
296 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
298 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
299 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
300 vector<bool> validDistances; validDistances.resize(numGroups, false);
// user interrupt: close the input and delete the per-group temp distance files
307 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } }
309 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
311 //if both sequences are in the same group then they are within the cutoff
312 it = seqGroup.find(seqA);
313 it2 = seqGroup.find(seqB);
315 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
316 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer is full: flush it together with the current record
317 if (numOutputs[it->second] > 30) {
318 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
319 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
321 outputs[it->second] = "";
322 numOutputs[it->second] = 0;
323 validDistances[it->second] = true;
// presumably the else branch: keep buffering - TODO confirm against the hidden lines
325 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
326 numOutputs[it->second]++;
333 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
334 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
336 //write out any remaining buffers
337 if (numOutputs[i] > 0) {
338 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
339 outFile << outputs[i];
343 validDistances[i] = true;
347 ifstream bigNameFile;
348 m->openInputFile(namefile, bigNameFile);
// names that end up in no usable group are collected in this extra file
350 singleton = namefile + ".extra.temp";
351 ofstream remainingNames;
352 m->openOutputFile(singleton, remainingNames);
354 bool wroteExtra = false;
356 string name, nameList;
357 while(!bigNameFile.eof()){
358 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
360 //did this sequence get assigned a group
361 it = seqGroup.find(name);
363 if (it != seqGroup.end()) {
364 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
365 outFile << name << '\t' << nameList << endl;
// presumably the else branch (name not in any group) - TODO confirm against the hidden lines
369 remainingNames << name << '\t' << nameList << endl;
374 for(int i=0;i<numGroups;i++){
375 string tempNameFile = namefile + "." + toString(i) + ".temp";
376 string tempDistFile = distFile + "." + toString(i) + ".temp";
378 //if there are valid distances
379 if (validDistances[i]) {
380 map<string, string> temp;
381 temp[tempDistFile] = tempNameFile;
382 dists.push_back(temp);
// presumably the no-valid-distances case: the group's names are copied into the extra file and
// the group's temp name file is deleted - TODO confirm against the hidden lines
385 m->openInputFile(tempNameFile, in);
388 in >> name >> nameList; m->gobble(in);
390 remainingNames << name << '\t' << nameList << endl;
393 m->mothurRemove(tempNameFile);
397 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible here; `wroteExtra` above suggests it is conditional
400 m->mothurRemove(singleton);
// user interrupt: delete every registered distance/name file
404 if (m->control_pressed) {
405 for (int i = 0; i < dists.size(); i++) {
406 m->mothurRemove((dists[i].begin()->first));
407 m->mothurRemove((dists[i].begin()->second));
414 catch(exception& e) {
415 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
419 /***********************************************************************/
// Splits distFile into groups of connected sequences, buffering output and spilling to
// per-group temp files ("<distFile>.<group>.temp") so that not everything is held in memory.
// Steps visible in this excerpt, for each (seqA, seqB, dist) record read:
//   - scan the existing groups for seqA and seqB; a hit adds the partner sequence to that group;
//   - if the two sequences were found in different groups, merge the higher-indexed
//     group into the lower-indexed one and clear the emptied group;
//   - if neither was found, start a new group with its own buffer entry;
//   - otherwise append the record to the group's buffer, writing it to the temp file once
//     the buffer holds more than 60 records;
//   - after a group merge, also merge the two buffers and, if the absorbed group had
//     already been written to disk, copy its temp file into the surviving group's file.
// After the read loop, any non-empty buffers are appended to their temp files.
// Returns 0 early when m->control_pressed is set. Declarations of numGroups, groupID,
// groupIDA/groupIDB, outFile, size and memblock, plus the loop header and any cutoff
// test, are on lines not shown here.
420 int SplitMatrix::splitDistanceLarge(){
// groups[i] is the set of sequence names belonging to group i
422 vector<set<string> > groups;
424 //for buffering the io to improve speed
425 //a group's buffer is written to its temp file once it holds more than 60 dists.
426 vector<string> outputs;
427 vector<int> numOutputs;
// wroteOutPut[i] is true once group i has data on disk
428 vector<bool> wroteOutPut;
434 m->openInputFile(distFile, dFile);
440 dFile >> seqA >> seqB >> dist;
// user interrupt: close the input, delete temp files of non-empty groups and stop
442 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
445 //cout << "in cutoff: " << dist << endl;
450 for(int i=0;i<numGroups;i++){
451 set<string>::iterator aIt = groups[i].find(seqA);
452 set<string>::iterator bIt = groups[i].find(seqB);
454 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
455 groups[i].insert(seqB);
459 //cout << "in aIt: " << groupID << endl;
462 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
463 groups[i].insert(seqA);
467 // cout << "in bIt: " << groupID << endl;
471 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
472 if(groupIDA < groupIDB){
473 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
474 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
475 groups[groupIDB].clear();
479 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
480 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
481 groups[groupIDA].clear();
488 //windows is gonna gag on the reuse of outFile, will need to make it local...
490 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
491 set<string> newGroup;
492 newGroup.insert(seqA);
493 newGroup.insert(seqB);
494 groups.push_back(newGroup);
// the new group's buffer starts with this record and nothing on disk yet
496 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
497 outputs.push_back(tempOut);
498 numOutputs.push_back(1);
499 wroteOutPut.push_back(false);
504 string fileName = distFile + "." + toString(groupID) + ".temp";
506 //have we reached the max buffer size
507 if (numOutputs[groupID] > 60) { //write out sequence
508 outFile.open(fileName.c_str(), ios::app);
509 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
512 outputs[groupID] = "";
513 numOutputs[groupID] = 0;
514 wroteOutPut[groupID] = true;
// presumably the else branch: keep buffering - TODO confirm against the hidden lines
516 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
517 numOutputs[groupID]++;
520 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
521 string row, column, distance;
522 if(groupIDA<groupIDB){
// move group B's buffered records into the surviving group's buffer
525 numOutputs[groupID] += numOutputs[groupIDB];
526 outputs[groupID] += outputs[groupIDB];
528 outputs[groupIDB] = "";
529 numOutputs[groupIDB] = 0;
531 //if groupB is written to file it is above buffer size so read and write to new merged file
532 if (wroteOutPut[groupIDB]) {
533 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened at end-of-file so tellg() below yields the file size
534 ifstream fileB(fileName2.c_str(), ios::ate);
536 outFile.open(fileName.c_str(), ios::app);
541 size = fileB.tellg();
543 fileB.seekg (0, ios::beg);
// copy in 1024-byte chunks, then the remainder
545 int numRead = size / 1024;
546 int lastRead = size % 1024;
548 for (int i = 0; i < numRead; i++) {
550 memblock = new char [1024];
552 fileB.read (memblock, 1024);
// NOTE(review): read() does not null-terminate memblock, so building a std::string from it
// as a C string can scan past the 1024-byte buffer; substr() then trims the result
554 string temp = memblock;
555 outFile << temp.substr(0, 1024);
560 memblock = new char [lastRead];
562 fileB.read (memblock, lastRead);
564 //not sure why but it will read more than lastRead char...??
// NOTE(review): likely the same missing null terminator as above - the string is built past the end of memblock
565 string temp = memblock;
566 outFile << temp.substr(0, lastRead);
570 m->mothurRemove(fileName2);
572 //write out the merged memory
573 if (numOutputs[groupID] > 60) {
574 outFile << outputs[groupID];
575 outputs[groupID] = "";
576 numOutputs[groupID] = 0;
581 wroteOutPut[groupID] = true;
582 wroteOutPut[groupIDB] = false;
583 }else{ } //just merge b's memory with a's memory
// mirror image of the branch above: group A is absorbed into the surviving group
586 numOutputs[groupID] += numOutputs[groupIDA];
587 outputs[groupID] += outputs[groupIDA];
589 outputs[groupIDA] = "";
590 numOutputs[groupIDA] = 0;
592 if (wroteOutPut[groupIDA]) {
593 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened at end-of-file so tellg() below yields the file size
594 ifstream fileB(fileName2.c_str(), ios::ate);
596 outFile.open(fileName.c_str(), ios::app);
601 size = fileB.tellg();
603 fileB.seekg (0, ios::beg);
605 int numRead = size / 1024;
606 int lastRead = size % 1024;
608 for (int i = 0; i < numRead; i++) {
610 memblock = new char [1024];
612 fileB.read (memblock, 1024);
// NOTE(review): same missing-null-terminator concern as in the group B copy loop
613 string temp = memblock;
614 outFile << temp.substr(0, 1024);
619 memblock = new char [lastRead];
621 fileB.read (memblock, lastRead);
623 //not sure why but it will read more than lastRead char...??
624 string temp = memblock;
625 outFile << temp.substr(0, lastRead);
630 m->mothurRemove(fileName2);
632 //write out the merged memory
633 if (numOutputs[groupID] > 60) {
634 outFile << outputs[groupID];
635 outputs[groupID] = "";
636 numOutputs[groupID] = 0;
641 wroteOutPut[groupID] = true;
642 wroteOutPut[groupIDA] = false;
643 }else { } //just merge memory
// final pass: append whatever is still buffered for each group
652 for (int i = 0; i < numGroups; i++) {
653 if (numOutputs[i] > 0) {
654 string fileName = distFile + "." + toString(i) + ".temp";
655 outFile.open(fileName.c_str(), ios::app);
656 outFile << outputs[i];
665 catch(exception& e) {
666 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
670 //********************************************************************************************************************
// Splits the name file to match the sequence groups produced by a distance split.
// Parameter:
//   groups - groups[i] is the set of sequence names in group i; empty groups are skipped.
// Steps visible in this excerpt:
//   1. load the whole name file into nameMap (name -> nameList);
//   2. for each non-empty group write "<namefile>.<i>.temp" with that group's entries;
//      a group member missing from the name file prints a message and calls exit(1);
//   3. if nameMap is non-empty afterwards, write its entries to "<namefile>.extra.temp"
//      and store that path in `singleton`; otherwise `singleton` becomes "none";
//   4. register each non-empty group's (distance file -> name file) pair in `dists`;
//   5. on m->control_pressed, delete the registered files.
// NOTE(review): step 3 only isolates singletons if matched entries are removed from
// nameMap on a line not shown here - confirm.
671 int SplitMatrix::splitNames(vector<set<string> >& groups){
673 int numGroups = groups.size();
675 ifstream bigNameFile(namefile.c_str());
// the open-failure test guarding this message is on a line not shown here
677 cerr << "Error: We can't open the name file\n";
681 map<string, string> nameMap;
682 string name, nameList;
684 bigNameFile >> name >> nameList;
685 nameMap[name] = nameList;
686 m->gobble(bigNameFile);
690 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
691 int numSeqsInGroup = groups[i].size();
693 if(numSeqsInGroup > 0){
694 string fileName = namefile + "." + toString(i) + ".temp";
// NOTE(review): ios::ate without ios::app or ios::in - on standard file streams this truncates an existing file; confirm that is intended
695 ofstream smallNameFile(fileName.c_str(), ios::ate);
697 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
698 map<string,string>::iterator nIt = nameMap.find(*gIt);
699 if (nIt != nameMap.end()) {
700 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// presumably the else branch: a grouped sequence is absent from the name file, which is fatal
703 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
706 smallNameFile.close();
710 //names of singletons
711 if (nameMap.size() != 0) {
712 singleton = namefile + ".extra.temp";
713 ofstream remainingNames(singleton.c_str(), ios::ate);
714 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
715 remainingNames << nIt->first << '\t' << nIt->second << endl;
717 remainingNames.close();
718 }else { singleton = "none"; }
// pair each non-empty group's temp distance file with its temp name file
720 for(int i=0;i<numGroups;i++){
721 if(groups[i].size() > 0){
722 string tempNameFile = namefile + "." + toString(i) + ".temp";
723 string tempDistFile = distFile + "." + toString(i) + ".temp";
725 map<string, string> temp;
726 temp[tempDistFile] = tempNameFile;
727 dists.push_back(temp);
// user interrupt: delete every registered distance/name file
731 if (m->control_pressed) {
732 for (int i = 0; i < dists.size(); i++) {
733 m->mothurRemove((dists[i].begin()->first));
734 m->mothurRemove((dists[i].begin()->second));
741 catch(exception& e) {
742 m->errorOut(e, "SplitMatrix", "splitNames");
746 //********************************************************************************************************************
// In-memory variant of the distance split: groups connected sequences exactly like
// splitDistanceLarge but keeps every group's records in `outputs` until the end.
// Steps visible in this excerpt, for each (seqA, seqB, dist) record read:
//   - scan the existing groups for seqA and seqB; a hit adds the partner sequence to that group;
//   - if the two sequences were found in different groups, merge the higher-indexed
//     group into the lower-indexed one and clear the emptied group;
//   - if neither was found, start a new group whose buffer begins with this record;
//   - otherwise append the record to the group's buffer;
//   - after a group merge, concatenate the absorbed group's buffer onto the survivor's.
// After the read loop each non-empty buffer is written to "<distFile>.<i>.temp".
// Returns 0 early when m->control_pressed is set. Declarations of numGroups, groupID,
// groupIDA/groupIDB and outFile, plus the loop header and any cutoff test, are on lines
// not shown here.
747 int SplitMatrix::splitDistanceRAM(){
// groups[i] is the set of sequence names in group i; outputs[i] holds its distance records as text
749 vector<set<string> > groups;
750 vector<string> outputs;
755 m->openInputFile(distFile, dFile);
761 dFile >> seqA >> seqB >> dist;
// user interrupt: close the input, delete temp files of non-empty groups and stop
763 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
766 //cout << "in cutoff: " << dist << endl;
771 for(int i=0;i<numGroups;i++){
772 set<string>::iterator aIt = groups[i].find(seqA);
773 set<string>::iterator bIt = groups[i].find(seqB);
775 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
776 groups[i].insert(seqB);
780 //cout << "in aIt: " << groupID << endl;
783 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
784 groups[i].insert(seqA);
788 // cout << "in bIt: " << groupID << endl;
792 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
793 if(groupIDA < groupIDB){
794 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
795 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
796 groups[groupIDB].clear();
800 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
801 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
802 groups[groupIDA].clear();
809 //windows is gonna gag on the reuse of outFile, will need to make it local...
811 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
812 set<string> newGroup;
813 newGroup.insert(seqA);
814 newGroup.insert(seqB);
815 groups.push_back(newGroup);
// the new group's buffer starts with this record
817 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
818 outputs.push_back(tempOut);
// presumably the else branch: append to the existing group's buffer - TODO confirm against the hidden lines
823 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
825 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
826 string row, column, distance;
827 if(groupIDA<groupIDB){
// move the absorbed group's records into the survivor and empty the absorbed buffer
829 outputs[groupID] += outputs[groupIDB];
830 outputs[groupIDB] = "";
832 outputs[groupID] += outputs[groupIDA];
833 outputs[groupIDA] = "";
// final pass: write each non-empty buffer to its temp distance file
842 for (int i = 0; i < numGroups; i++) {
843 if (outputs[i] != "") {
845 string fileName = distFile + "." + toString(i) + ".temp";
// NOTE(review): ios::ate without ios::app or ios::in - on standard file streams this truncates an existing file; confirm that is intended
846 outFile.open(fileName.c_str(), ios::ate);
847 outFile << outputs[i];
856 catch(exception& e) {
857 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
861 //********************************************************************************************************************
// Returns the size in bytes of the file named by the first key of `entry`
// (the distance file of a distance-file -> name-file pair).
// An empty entry, a file that cannot be opened (reported through perror), or a failed
// seek/tell all count as 0 bytes, so such entries sort after every non-empty file.
inline long distanceFileBytes(const std::map<std::string, std::string>& entry){
	//begin() of an empty map must never be dereferenced
	if (entry.empty()) { return 0; }

	const std::string& filename = entry.begin()->first;

	FILE* pFile = fopen(filename.c_str(), "rb");
	if (pFile == NULL) {
		const std::string error = "Error opening " + filename;
		perror(error.c_str());
		return 0;
	}

	//get num bytes in file
	long numBytes = 0;
	if (fseek(pFile, 0, SEEK_END) == 0) {
		numBytes = ftell(pFile);
		if (numBytes < 0) { numBytes = 0; } //ftell reports failure as -1
	}
	fclose(pFile);

	return numBytes;
}
//********************************************************************************************************************
//sorts biggest to smallest
// Comparator for std::sort over distance-file -> name-file entries.
// Returns true when left's distance file is strictly larger than right's.
// Parameters are taken by const reference so each comparison no longer copies both maps.
inline bool compareFileSizes(const std::map<std::string, std::string>& left, const std::map<std::string, std::string>& right){
	const long leftsize = distanceFileBytes(left);
	const long rightsize = distanceFileBytes(right);

	return (leftsize > rightsize);
}
895 /***********************************************************************/
896 //returns map of distance files -> namefile sorted by distance file size
// Sorts the `dists` member in place with compareFileSizes before it is handed back;
// the return statement itself is on a line not shown in this excerpt.
// NOTE(review): compareFileSizes opens both files on every comparison, so the sort does repeated file I/O.
897 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
900 sort(dists.begin(), dists.end(), compareFileSizes);
904 catch(exception& e) {
905 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
909 /***********************************************************************/
// Destructor: the body is empty, so nothing is released explicitly here.
910 SplitMatrix::~SplitMatrix(){}
911 /***********************************************************************/