5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
15 /***********************************************************************/
// Constructor used when an existing distance file is supplied.
// Caches the shared MothurOut instance in m; the rest of the class uses it
// for logging, file helpers and error reporting.
// NOTE(review): presumably distfile/name/tax are file paths, c is the cutoff,
// t the splitting method tested in split() and l the "large" flag tested in
// splitDistance() - confirm against the member assignments.
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18 m = MothurOut::getInstance();
26 /***********************************************************************/
// Constructor used when the distances still have to be calculated from a
// fasta file. Caches the shared MothurOut instance and stores two separate
// cutoffs: c is the taxonomy-level cutoff, cu is the distance cutoff handed to
// the distance calculation.
// NOTE(review): presumably p is the processor count, cl the "classic" flag and
// output the output directory used by createDistanceFilesFromTax() - confirm.
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){
29 m = MothurOut::getInstance();
33 cutoff = c; //tax level cutoff
34 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
41 /***********************************************************************/
// Entry point for the split. Dispatches on the `method` member:
// "distance" is handled by one branch, "classify" and "fasta" share another.
// Any other value is reported to the user and a single entry mapping the
// original distance file to the original name file is appended to dists, so
// callers still receive one (unsplit) matrix.
// Exceptions are reported through m->errorOut.
43 int SplitMatrix::split(){
46 if (method == "distance") {
48 }else if ((method == "classify") || (method == "fasta")) {
// unknown method: warn and fall back to the unsplit distance/name file pair
51 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
52 map<string, string> temp;
53 temp[distFile] = namefile;
54 dists.push_back(temp);
60 m->errorOut(e, "SplitMatrix", "split");
64 /***********************************************************************/
// Splits by distance, choosing the implementation from the `large` member:
// splitDistanceLarge() buffers a limited number of distances per group and
// spills to temp files, splitDistanceRAM() keeps every group's output in memory.
// Exceptions are reported through m->errorOut.
65 int SplitMatrix::splitDistance(){
68 if (large) { splitDistanceLarge(); }
69 else { splitDistanceRAM(); }
75 m->errorOut(e, "SplitMatrix", "splitDistance");
80 /***********************************************************************/
// Splits by taxonomy. Builds a PhyloTree from the taxonomy file, then assigns
// every sequence that sits under a node at level `cutoff` to a numbered group
// (seqGroup: sequence name -> group id). Nodes holding a single sequence are
// treated as singletons and get no group. The grouping is then handed to
// splitDistanceFileByTax() when method is "classify", otherwise to
// createDistanceFilesFromTax().
// Exceptions are reported through m->errorOut.
81 int SplitMatrix::splitClassify(){
85 map<string, int> seqGroup;
86 map<string, int>::iterator it;
87 map<string, int>::iterator it2;
91 //build tree from users taxonomy file
92 PhyloTree* phylo = new PhyloTree();
94 map<string, string> temp;
95 m->readTax(taxFile, temp);
// NOTE(review): the loop header has no increment, so the iterator has to be
// advanced (or the entry erased) inside the body - confirm.
97 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
98 phylo->addSeqToTree(itTemp->first, itTemp->second);
102 phylo->assignHeirarchyIDs(0);
104 //make sure the cutoff is not greater than maxlevel
105 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
107 //for each node in tree
108 for (int i = 0; i < phylo->getNumNodes(); i++) {
110 //is this node within the cutoff
111 TaxNode taxon = phylo->get(i);
113 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
114 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
115 for (int j = 0; j < taxon.accessions.size(); j++) {
// every accession under this node shares the current group id
116 seqGroup[taxon.accessions[j]] = numGroups;
// "classify": a distance file already exists and only needs partitioning;
// otherwise the per-group distance files are calculated from the fasta file
125 if (method == "classify") {
126 splitDistanceFileByTax(seqGroup, numGroups);
128 createDistanceFilesFromTax(seqGroup, numGroups);
134 catch(exception& e) {
135 m->errorOut(e, "SplitMatrix", "splitClassify");
139 /***********************************************************************/
// Creates one distance file per taxonomy group directly from the fasta file.
//   seqGroup  - sequence name -> group id, as built by splitClassify()
//   numGroups - number of group ids in use (ids run from 0 to numGroups-1)
// Steps visible here:
//   1. write each grouped sequence to "<fastafile>.<group>.temp"
//   2. run a DistanceCommand on every per-group fasta file
//   3. split the name file into "<namefile>.<group>.temp" files, sending
//      ungrouped names to "<namefile>.extra.temp" (the singleton file)
//   4. register every group whose distance file is non-blank in dists
// Exceptions are reported through m->errorOut.
140 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// working copy: names are erased as they are found in the fasta file, so what
// is left afterwards is grouped in the taxonomy but absent from the fasta file
142 map<string, int> copyGroups = seqGroup;
143 map<string, int>::iterator it;
146 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
147 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151 m->openInputFile(fastafile, in);
156 Sequence query(in); m->gobble(in);
157 if (query.getName() != "") {
159 it = seqGroup.find(query.getName());
161 //save names in case no namefile is given
162 if (namefile == "") { names.insert(query.getName()); }
164 if (it != seqGroup.end()) { //not singleton
// append the sequence to its group's temporary fasta file
165 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
166 query.printSequence(outFile);
169 copyGroups.erase(query.getName());
175 //warn about sequence in groups that are not in fasta file
176 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
177 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
183 //process each distance file
184 for (int i = 0; i < numGroups; i++) {
// classic mode requests output=lt; otherwise the distance cutoff is passed on
187 if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
188 else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
189 if (outputDir != "") { options += ", outputdir=" + outputDir; }
// NOTE(review): allocated with new - confirm the command is executed and
// deleted before the temporary fasta file is removed below
191 Command* command = new DistanceCommand(options);
196 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
198 //remove old names files just in case
199 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
// names that do not end up in a usable group are collected in the singleton file
202 singleton = namefile + ".extra.temp";
203 ofstream remainingNames;
204 m->openOutputFile(singleton, remainingNames);
206 bool wroteExtra = false;
208 ifstream bigNameFile;
209 m->openInputFile(namefile, bigNameFile);
211 string name, nameList;
212 while(!bigNameFile.eof()){
213 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
215 //did this sequence get assigned a group
216 it = seqGroup.find(name);
218 if (it != seqGroup.end()) {
219 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
220 outFile << name << '\t' << nameList << endl;
224 remainingNames << name << '\t' << nameList << endl;
229 for(int i=0;i<numGroups;i++){
230 string tempNameFile = namefile + "." + toString(i) + ".temp";
// note: this overwrites the outputDir member when it was left empty
231 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
232 string tempDistFile = "";
// the expected file name ends in "phylip.dist" for classic mode, "dist" otherwise
233 if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
234 else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
236 //if there are valid distances
238 fileHandle.open(tempDistFile.c_str());
240 m->gobble(fileHandle);
241 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
242 map<string, string> temp;
243 temp[tempDistFile] = tempNameFile;
244 dists.push_back(temp);
// NOTE(review): presumably the blank-file branch - the group's names are moved
// to the singleton file and its temporary name file is removed; confirm.
247 m->openInputFile(tempNameFile, in);
250 in >> name >> nameList; m->gobble(in);
252 remainingNames << name << '\t' << nameList << endl;
255 m->mothurRemove(tempNameFile);
261 remainingNames.close();
// NOTE(review): presumably only executed when nothing was written to the
// singleton file (see wroteExtra) - confirm.
263 m->mothurRemove(singleton);
// user interrupt: delete every registered distance/name file pair and forget them
267 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
271 catch(exception& e) {
272 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
276 /***********************************************************************/
// Partitions an existing column-format distance file by taxonomy group.
//   seqGroup  - sequence name -> group id, as built by splitClassify()
//   numGroups - number of group ids in use (ids run from 0 to numGroups-1)
// A distance is kept only when both sequences belong to the same group; it is
// written to "<distFile>.<group>.temp". The name file is split the same way
// into "<namefile>.<group>.temp", with ungrouped names going to
// "<namefile>.extra.temp". Groups that received at least one distance are
// registered in dists.
// Exceptions are reported through m->errorOut.
277 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
279 map<string, int>::iterator it;
280 map<string, int>::iterator it2;
283 m->openInputFile(distFile, dFile);
286 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
287 m->mothurRemove((distFile + "." + toString(i) + ".temp"));
290 //for buffering the io to improve speed
291 //distances are buffered per group and flushed once more than 30 are pending.
292 vector<string> outputs; outputs.resize(numGroups, "");
293 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
295 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
296 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
297 vector<bool> validDistances; validDistances.resize(numGroups, false);
// NOTE(review): unlike the equivalent check in splitDistanceLarge(), no return
// is visible on this line - confirm reading stops once dFile has been closed.
304 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } }
306 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
308 //if both sequences are in the same group then they are within the cutoff
309 it = seqGroup.find(seqA);
310 it2 = seqGroup.find(seqB);
312 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
313 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer full: write the pending text plus the current distance, then reset
314 if (numOutputs[it->second] > 30) {
315 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
316 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
318 outputs[it->second] = "";
319 numOutputs[it->second] = 0;
320 validDistances[it->second] = true;
// otherwise just queue the distance in the group's buffer
322 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
323 numOutputs[it->second]++;
330 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
331 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
333 //write out any remaining buffers
334 if (numOutputs[i] > 0) {
335 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
336 outFile << outputs[i];
340 validDistances[i] = true;
344 ifstream bigNameFile;
345 m->openInputFile(namefile, bigNameFile);
// names that do not end up in a usable group are collected in the singleton file
347 singleton = namefile + ".extra.temp";
348 ofstream remainingNames;
349 m->openOutputFile(singleton, remainingNames);
351 bool wroteExtra = false;
353 string name, nameList;
354 while(!bigNameFile.eof()){
355 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
357 //did this sequence get assigned a group
358 it = seqGroup.find(name);
360 if (it != seqGroup.end()) {
361 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
362 outFile << name << '\t' << nameList << endl;
366 remainingNames << name << '\t' << nameList << endl;
371 for(int i=0;i<numGroups;i++){
372 string tempNameFile = namefile + "." + toString(i) + ".temp";
373 string tempDistFile = distFile + "." + toString(i) + ".temp";
375 //if there are valid distances
376 if (validDistances[i]) {
377 map<string, string> temp;
378 temp[tempDistFile] = tempNameFile;
379 dists.push_back(temp);
// NOTE(review): presumably the no-valid-distances branch - the group's names
// are moved to the singleton file and its temporary name file is removed; confirm.
382 m->openInputFile(tempNameFile, in);
385 in >> name >> nameList; m->gobble(in);
387 remainingNames << name << '\t' << nameList << endl;
390 m->mothurRemove(tempNameFile);
394 remainingNames.close();
// NOTE(review): presumably only executed when nothing was written to the
// singleton file (see wroteExtra) - confirm.
397 m->mothurRemove(singleton);
// user interrupt: delete every registered distance/name file pair
401 if (m->control_pressed) {
402 for (int i = 0; i < dists.size(); i++) {
403 m->mothurRemove((dists[i].begin()->first));
404 m->mothurRemove((dists[i].begin()->second));
411 catch(exception& e) {
412 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
416 /***********************************************************************/
// Splits the distance file into groups of linked sequences while keeping
// memory use bounded. Each group owns a set of sequence names, a text buffer
// of pending distance lines and a temp file "<distFile>.<group>.temp" that the
// buffer is flushed to.
// For every distance read:
//   - neither sequence is known: a new group is started
//   - one sequence is known: the other one joins that group
//   - both are known (in two groups): the groups are merged into the one with
//     the lower index, including buffers and any temp file already written
// Exceptions are reported through m->errorOut.
417 int SplitMatrix::splitDistanceLarge(){
419 vector<set<string> > groups;
421 //for buffering the io to improve speed
422 //distances are buffered per group and flushed once more than 60 are pending.
// outputs[i]     - pending distance lines of group i
// numOutputs[i]  - number of lines currently held in outputs[i]
// wroteOutPut[i] - true once group i has data in its temp file
423 vector<string> outputs;
424 vector<int> numOutputs;
425 vector<bool> wroteOutPut;
431 m->openInputFile(distFile, dFile);
437 dFile >> seqA >> seqB >> dist;
// user interrupt: close the input, remove temp files of non-empty groups, stop
439 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
442 //cout << "in cutoff: " << dist << endl;
// look for the groups that already contain seqA and/or seqB
447 for(int i=0;i<numGroups;i++){
448 set<string>::iterator aIt = groups[i].find(seqA);
449 set<string>::iterator bIt = groups[i].find(seqB);
451 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
452 groups[i].insert(seqB);
456 //cout << "in aIt: " << groupID << endl;
459 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
460 groups[i].insert(seqA);
464 // cout << "in bIt: " << groupID << endl;
468 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// the group with the lower index survives; the other one is emptied
469 if(groupIDA < groupIDB){
470 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
471 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
472 groups[groupIDB].clear();
476 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
477 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
478 groups[groupIDA].clear();
485 //windows is gonna gag on the reuse of outFile, will need to make it local...
487 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
488 set<string> newGroup;
489 newGroup.insert(seqA);
490 newGroup.insert(seqB);
491 groups.push_back(newGroup);
// the new group's buffer starts with this distance; nothing is on disk yet
493 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
494 outputs.push_back(tempOut);
495 numOutputs.push_back(1);
496 wroteOutPut.push_back(false);
501 string fileName = distFile + "." + toString(groupID) + ".temp";
503 //have we reached the max buffer size
504 if (numOutputs[groupID] > 60) { //write out sequence
505 outFile.open(fileName.c_str(), ios::app);
506 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
509 outputs[groupID] = "";
510 numOutputs[groupID] = 0;
511 wroteOutPut[groupID] = true;
// buffer not full yet: queue the distance in memory
513 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
514 numOutputs[groupID]++;
517 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
518 string row, column, distance;
519 if(groupIDA<groupIDB){
// move group B's pending lines into the surviving group's buffer
522 numOutputs[groupID] += numOutputs[groupIDB];
523 outputs[groupID] += outputs[groupIDB];
525 outputs[groupIDB] = "";
526 numOutputs[groupIDB] = 0;
528 //if groupB is written to file it is above buffer size so read and write to new merged file
529 if (wroteOutPut[groupIDB]) {
530 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened at the end so that tellg() below yields the file size
531 ifstream fileB(fileName2.c_str(), ios::ate);
533 outFile.open(fileName.c_str(), ios::app);
538 size = fileB.tellg();
540 fileB.seekg (0, ios::beg);
// copy the file in 1024-byte chunks plus one final partial chunk
542 int numRead = size / 1024;
543 int lastRead = size % 1024;
545 for (int i = 0; i < numRead; i++) {
// NOTE(review): a buffer is allocated on every iteration - verify it is
// released with delete[] each time
547 memblock = new char [1024];
549 fileB.read (memblock, 1024);
// NOTE(review): read() does not NUL-terminate memblock, so constructing a
// std::string from it relies on a terminator beyond the buffer; substr() only
// trims the result afterwards - confirm and consider outFile.write()
551 string temp = memblock;
552 outFile << temp.substr(0, 1024);
557 memblock = new char [lastRead];
559 fileB.read (memblock, lastRead);
561 //not sure why but it will read more than lastRead char...??
// NOTE(review): most likely the same missing NUL terminator as above - confirm
562 string temp = memblock;
563 outFile << temp.substr(0, lastRead);
567 m->mothurRemove(fileName2);
569 //write out the merged memory
570 if (numOutputs[groupID] > 60) {
571 outFile << outputs[groupID];
572 outputs[groupID] = "";
573 numOutputs[groupID] = 0;
// the surviving group now has data on disk; group B's file is gone
578 wroteOutPut[groupID] = true;
579 wroteOutPut[groupIDB] = false;
580 }else{ } //just merge b's memory with a's memory
// mirror case: group A is merged into the surviving group
583 numOutputs[groupID] += numOutputs[groupIDA];
584 outputs[groupID] += outputs[groupIDA];
586 outputs[groupIDA] = "";
587 numOutputs[groupIDA] = 0;
589 if (wroteOutPut[groupIDA]) {
590 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened at the end so that tellg() below yields the file size
591 ifstream fileB(fileName2.c_str(), ios::ate);
593 outFile.open(fileName.c_str(), ios::app);
598 size = fileB.tellg();
600 fileB.seekg (0, ios::beg);
602 int numRead = size / 1024;
603 int lastRead = size % 1024;
605 for (int i = 0; i < numRead; i++) {
// NOTE(review): same allocation and NUL-termination concerns as in the
// group B copy loop above - confirm
607 memblock = new char [1024];
609 fileB.read (memblock, 1024);
610 string temp = memblock;
611 outFile << temp.substr(0, 1024);
616 memblock = new char [lastRead];
618 fileB.read (memblock, lastRead);
620 //not sure why but it will read more than lastRead char...??
621 string temp = memblock;
622 outFile << temp.substr(0, lastRead);
627 m->mothurRemove(fileName2);
629 //write out the merged memory
630 if (numOutputs[groupID] > 60) {
631 outFile << outputs[groupID];
632 outputs[groupID] = "";
633 numOutputs[groupID] = 0;
// the surviving group now has data on disk; group A's file is gone
638 wroteOutPut[groupID] = true;
639 wroteOutPut[groupIDA] = false;
640 }else { } //just merge memory
// end of input: flush whatever is still buffered for each group
649 for (int i = 0; i < numGroups; i++) {
650 if (numOutputs[i] > 0) {
651 string fileName = distFile + "." + toString(i) + ".temp";
652 outFile.open(fileName.c_str(), ios::app);
653 outFile << outputs[i];
662 catch(exception& e) {
663 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
667 //********************************************************************************************************************
// Splits the name file so that it matches the per-group distance files.
//   groups - one set of sequence names per group; empty sets are skipped
// Writes "<namefile>.<group>.temp" for every non-empty group, writes the
// names left in nameMap to "<namefile>.extra.temp" (singleton is set to
// "none" when there are none) and registers each non-empty group's
// distance/name file pair in dists.
// A sequence that is in a group but not in the name file is fatal: the
// message is printed and the process exits with status 1.
// Exceptions are reported through m->errorOut.
668 int SplitMatrix::splitNames(vector<set<string> >& groups){
670 int numGroups = groups.size();
672 ifstream bigNameFile(namefile.c_str());
674 cerr << "Error: We can't open the name file\n";
// load the whole name file: first column -> second column
678 map<string, string> nameMap;
679 string name, nameList;
681 bigNameFile >> name >> nameList;
682 nameMap[name] = nameList;
683 m->gobble(bigNameFile);
687 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
688 int numSeqsInGroup = groups[i].size();
690 if(numSeqsInGroup > 0){
691 string fileName = namefile + "." + toString(i) + ".temp";
// NOTE(review): an ofstream opened with ios::ate alone (no app/in) truncates
// an existing file - fine if a fresh file is intended, confirm
692 ofstream smallNameFile(fileName.c_str(), ios::ate);
694 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
695 map<string,string>::iterator nIt = nameMap.find(*gIt);
696 if (nIt != nameMap.end()) {
697 smallNameFile << nIt->first << '\t' << nIt->second << endl;
700 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
703 smallNameFile.close();
707 //names of singletons
// NOTE(review): this assumes names written to a group file were removed from
// nameMap above, leaving only ungrouped names - confirm
708 if (nameMap.size() != 0) {
709 singleton = namefile + ".extra.temp";
710 ofstream remainingNames(singleton.c_str(), ios::ate);
711 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
712 remainingNames << nIt->first << '\t' << nIt->second << endl;
714 remainingNames.close();
715 }else { singleton = "none"; }
// register the distance/name file pair of every non-empty group
717 for(int i=0;i<numGroups;i++){
718 if(groups[i].size() > 0){
719 string tempNameFile = namefile + "." + toString(i) + ".temp";
720 string tempDistFile = distFile + "." + toString(i) + ".temp";
722 map<string, string> temp;
723 temp[tempDistFile] = tempNameFile;
724 dists.push_back(temp);
// user interrupt: delete every registered distance/name file pair
728 if (m->control_pressed) {
729 for (int i = 0; i < dists.size(); i++) {
730 m->mothurRemove((dists[i].begin()->first));
731 m->mothurRemove((dists[i].begin()->second));
738 catch(exception& e) {
739 m->errorOut(e, "SplitMatrix", "splitNames");
743 //********************************************************************************************************************
// In-memory variant of the distance split. Groups are formed exactly as in
// splitDistanceLarge() (new group, join an existing group, or merge two groups
// into the one with the lower index), but every group's distance lines are
// kept in outputs[] and only written to "<distFile>.<group>.temp" once the
// whole distance file has been read.
// Exceptions are reported through m->errorOut.
744 int SplitMatrix::splitDistanceRAM(){
746 vector<set<string> > groups;
// outputs[i] holds all distance lines collected for group i
747 vector<string> outputs;
752 m->openInputFile(distFile, dFile);
758 dFile >> seqA >> seqB >> dist;
// user interrupt: close the input, remove temp files of non-empty groups, stop
760 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
763 //cout << "in cutoff: " << dist << endl;
// look for the groups that already contain seqA and/or seqB
768 for(int i=0;i<numGroups;i++){
769 set<string>::iterator aIt = groups[i].find(seqA);
770 set<string>::iterator bIt = groups[i].find(seqB);
772 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
773 groups[i].insert(seqB);
777 //cout << "in aIt: " << groupID << endl;
780 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
781 groups[i].insert(seqA);
785 // cout << "in bIt: " << groupID << endl;
789 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// the group with the lower index survives; the other one is emptied
790 if(groupIDA < groupIDB){
791 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
792 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
793 groups[groupIDB].clear();
797 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
798 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
799 groups[groupIDA].clear();
806 //windows is gonna gag on the reuse of outFile, will need to make it local...
808 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
809 set<string> newGroup;
810 newGroup.insert(seqA);
811 newGroup.insert(seqB);
812 groups.push_back(newGroup);
// the new group's output starts with this distance
814 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
815 outputs.push_back(tempOut);
// existing group: append the distance to its in-memory output
820 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
822 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
823 string row, column, distance;
824 if(groupIDA<groupIDB){
// move the emptied group's lines into the surviving group's output
826 outputs[groupID] += outputs[groupIDB];
827 outputs[groupIDB] = "";
829 outputs[groupID] += outputs[groupIDA];
830 outputs[groupIDA] = "";
// end of input: write every non-empty group's output to its temp file
839 for (int i = 0; i < numGroups; i++) {
840 if (outputs[i] != "") {
842 string fileName = distFile + "." + toString(i) + ".temp";
// NOTE(review): ios::ate without app/in truncates an existing file - fine if
// each group's file is written exactly once here, confirm
843 outFile.open(fileName.c_str(), ios::ate);
844 outFile << outputs[i];
853 catch(exception& e) {
854 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
858 //********************************************************************************************************************
859 //sorts biggest to smallest
// Comparator for std::sort over dists: returns true when the distance file of
// `left` (the first key of the map) is larger in bytes than that of `right`.
// Each size is obtained by opening the file and seeking to its end.
// NOTE(review): both maps are taken by value, so they are copied on every
// comparison; the files are also re-opened on every comparison.
860 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
865 //get num bytes in file
866 string filename = left.begin()->first;
867 pFile = fopen (filename.c_str(),"rb");
868 string error = "Error opening " + filename;
// NOTE(review): a failed open is only reported - confirm fseek/ftell are not
// reached with a NULL handle
869 if (pFile==NULL) perror (error.c_str());
871 fseek (pFile, 0, SEEK_END);
872 leftsize=ftell (pFile);
879 //get num bytes in file
880 filename = right.begin()->first;
881 pFile2 = fopen (filename.c_str(),"rb");
882 error = "Error opening " + filename;
883 if (pFile2==NULL) perror (error.c_str());
885 fseek (pFile2, 0, SEEK_END);
886 rightsize=ftell (pFile2);
890 return (leftsize > rightsize);
892 /***********************************************************************/
893 //returns map of distance files -> namefile sorted by distance file size
// Each element of the result is a one-entry map {distance file -> name file}.
// dists is sorted in place with compareFileSizes, i.e. largest file first.
// Exceptions are reported through m->errorOut.
894 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
897 sort(dists.begin(), dists.end(), compareFileSizes);
901 catch(exception& e) {
902 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
906 /***********************************************************************/
// Destructor: intentionally empty, no cleanup is performed here.
907 SplitMatrix::~SplitMatrix(){}
908 /***********************************************************************/