5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
15 /***********************************************************************/
// Constructor used when a distance file is supplied directly.
// The visible part of the body only fetches the shared MothurOut instance into m.
// NOTE(review): c, t and l presumably initialise the cutoff, the splitting method and the
// "large" flag that other methods in this file read - confirm against the full constructor.
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18 m = MothurOut::getInstance();
26 /***********************************************************************/
// Constructor used when starting from a fasta file.
// c is stored as the taxonomy-level cutoff and cu as the distance cutoff that is later
// passed to the distance calculation.
// NOTE(review): how ffile, name, tax, t, p and output are stored is not visible here.
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
29 m = MothurOut::getInstance();
33 cutoff = c; //tax level cutoff
34 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
40 /***********************************************************************/
// Entry point for splitting: chooses a strategy from the value of method.
// "distance" selects one branch, "classify" and "fasta" share a second branch.
// NOTE(review): the calls made inside those two branches are not visible in this excerpt.
42 int SplitMatrix::split(){
45 if (method == "distance") {
47 }else if ((method == "classify") || (method == "fasta")) {
// Fallback (presumably the final else): report the unknown method and register the
// unsplit distFile -> namefile pair as the only entry in dists.
50 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
51 map<string, string> temp;
52 temp[distFile] = namefile;
53 dists.push_back(temp);
// Exceptions are reported through MothurOut together with the class and method name.
59 m->errorOut(e, "SplitMatrix", "split");
63 /***********************************************************************/
// Splits the distance file by distance: uses splitDistanceLarge() when the large flag
// is set and splitDistanceRAM() otherwise.
64 int SplitMatrix::splitDistance(){
67 if (large) { splitDistanceLarge(); }
68 else { splitDistanceRAM(); }
// Exceptions are reported through MothurOut together with the class and method name.
72 m->errorOut(e, "SplitMatrix", "splitDistance");
77 /***********************************************************************/
// Groups sequences by taxonomy and then splits the data by those groups.
// A PhyloTree is built from taxFile; every tree node whose level equals cutoff and that
// holds more than one accession becomes a group, recorded in seqGroup as name -> group id.
// The group map is then handed to splitDistanceFileByTax() when method is "classify",
// or to createDistanceFilesFromTax() in the other visible branch.
78 int SplitMatrix::splitClassify(){
// seqGroup maps a sequence name to the id of the group it was assigned to.
82 map<string, int> seqGroup;
83 map<string, int>::iterator it;
84 map<string, int>::iterator it2;
88 //build tree from users taxonomy file
// NOTE(review): the matching delete for phylo is not visible in this excerpt - confirm it exists.
89 PhyloTree* phylo = new PhyloTree();
92 m->openInputFile(taxFile, in);
94 //read in users taxonomy file and add sequences to tree
// Each record is a sequence name followed by its taxonomy string.
97 in >> seqname >> tax; m->gobble(in);
98 phylo->addSeqToTree(seqname, tax);
102 phylo->assignHeirarchyIDs(0);
104 //make sure the cutoff is not greater than maxlevel
// When the requested cutoff is deeper than the tree, it is clamped to the tree's max level.
105 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
107 //for each node in tree
108 for (int i = 0; i < phylo->getNumNodes(); i++) {
110 //is this node within the cutoff
111 TaxNode taxon = phylo->get(i);
113 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
114 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
// Every accession of this taxon gets the current group id.
// NOTE(review): the increment of numGroups is presumably on a line not shown here.
115 for (int j = 0; j < taxon.accessions.size(); j++) {
116 seqGroup[taxon.accessions[j]] = numGroups;
// Dispatch on method: "classify" splits an existing distance file, the other branch
// builds new distance files from the fasta file.
125 if (method == "classify") {
126 splitDistanceFileByTax(seqGroup, numGroups);
128 createDistanceFilesFromTax(seqGroup, numGroups);
134 catch(exception& e) {
135 m->errorOut(e, "SplitMatrix", "splitClassify");
139 /***********************************************************************/
// Builds one distance file per taxonomy group, starting from the fasta file.
// Steps visible below:
//   1. write each grouped sequence to <fastafile>.<group>.temp,
//   2. create a DistanceCommand for every group's temp fasta file,
//   3. split the name file into <namefile>.<group>.temp files, sending ungrouped names
//      to the singleton file <namefile>.extra.temp,
//   4. register every group whose distance file is non-empty in dists.
// Parameters: seqGroup maps sequence name -> group id; numGroups is the number of groups.
140 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// copyGroups tracks grouped sequences that have not yet been seen in the fasta file.
142 map<string, int> copyGroups = seqGroup;
143 map<string, int>::iterator it;
146 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
147 remove((fastafile + "." + toString(i) + ".temp").c_str());
151 m->openInputFile(fastafile, in);
// Read the fasta file one sequence at a time; sequences with an empty name are ignored.
156 Sequence query(in); m->gobble(in);
157 if (query.getName() != "") {
159 it = seqGroup.find(query.getName());
161 //save names in case no namefile is given
162 if (namefile == "") { names.insert(query.getName()); }
164 if (it != seqGroup.end()) { //not singleton
// Append the sequence to the temp fasta file of its group.
165 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
166 query.printSequence(outFile);
// Mark the sequence as found so it is not reported as missing below.
169 copyGroups.erase(query.getName());
175 //warn about sequence in groups that are not in fasta file
176 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
177 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
183 //process each distance file
184 for (int i = 0; i < numGroups; i++) {
// The option string passes the group's temp fasta file, the processor count and distCutoff.
186 string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
// NOTE(review): the execute and delete calls for command are not visible in this excerpt.
188 Command* command = new DistanceCommand(options);
// The temp fasta file is removed here.
193 remove((fastafile + "." + toString(i) + ".temp").c_str());
195 //remove old names files just in case
196 remove((namefile + "." + toString(i) + ".temp").c_str());
// Names that do not end up in any group are collected in the singleton file.
199 singleton = namefile + ".extra.temp";
200 ofstream remainingNames;
201 m->openOutputFile(singleton, remainingNames);
203 bool wroteExtra = false;
205 ifstream bigNameFile;
206 m->openInputFile(namefile, bigNameFile);
// Each name-file record is a representative name followed by its name list.
208 string name, nameList;
209 while(!bigNameFile.eof()){
210 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
212 //did this sequence get assigned a group
213 it = seqGroup.find(name);
215 if (it != seqGroup.end()) {
216 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
217 outFile << name << '\t' << nameList << endl;
// Otherwise (presumably the else branch) the record goes to the singleton file.
221 remainingNames << name << '\t' << nameList << endl;
// Pair every group's name file with its distance file.
226 for(int i=0;i<numGroups;i++){
227 string tempNameFile = namefile + "." + toString(i) + ".temp";
// When no output directory was given, fall back to the path of the fasta file.
228 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
229 string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
231 //if there are valid distances
233 fileHandle.open(tempDistFile.c_str());
235 m->gobble(fileHandle);
236 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
237 map<string, string> temp;
238 temp[tempDistFile] = tempNameFile;
239 dists.push_back(temp);
// NOTE(review): the lines below presumably belong to the blank-file branch; they copy the
// group's names into the singleton file and delete the group's temp name file.
242 m->openInputFile(tempNameFile, in);
245 in >> name >> nameList; m->gobble(in);
247 remainingNames << name << '\t' << nameList << endl;
250 remove(tempNameFile.c_str());
256 remainingNames.close();
// NOTE(review): presumably guarded by wroteExtra so the file is only deleted when it is empty - confirm.
258 remove(singleton.c_str());
// On user interrupt, delete every registered distance/name file pair and clear dists.
262 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
266 catch(exception& e) {
267 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
271 /***********************************************************************/
// Splits an existing distance file into one file per taxonomy group.
// A distance is kept only when both of its sequences belong to the same group; it is
// appended to <distFile>.<group>.temp. The name file is split the same way, and names
// without a group go to the singleton file <namefile>.extra.temp.
// Parameters: seqGroup maps sequence name -> group id; numGroups is the number of groups.
272 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
274 map<string, int>::iterator it;
275 map<string, int>::iterator it2;
278 m->openInputFile(distFile, dFile);
281 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
282 remove((distFile + "." + toString(i) + ".temp").c_str());
285 //for buffering the io to improve speed
286 //buffer distances per group and write them out once more than 30 are pending (see the numOutputs check below).
287 vector<string> outputs; outputs.resize(numGroups, "");
288 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
290 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
291 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
292 vector<bool> validDistances; validDistances.resize(numGroups, false);
// On user interrupt, close the input and delete the per-group temp distance files.
// NOTE(review): no return is visible on this line - confirm how the loop is left afterwards.
299 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
// Each record is two sequence names followed by their distance.
301 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
303 //if both sequences are in the same group then they are within the cutoff
304 it = seqGroup.find(seqA);
305 it2 = seqGroup.find(seqB);
307 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
308 if (it->second == it2->second) { //they are from the same group so add the distance
// Buffer full: write the buffered text plus the current distance, then reset the buffer.
309 if (numOutputs[it->second] > 30) {
310 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
311 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
313 outputs[it->second] = "";
314 numOutputs[it->second] = 0;
315 validDistances[it->second] = true;
// Otherwise (presumably the else branch) just add the distance to the group's buffer.
317 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
318 numOutputs[it->second]++;
325 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
326 remove((namefile + "." + toString(i) + ".temp").c_str());
328 //write out any remaining buffers
329 if (numOutputs[i] > 0) {
330 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
331 outFile << outputs[i];
335 validDistances[i] = true;
339 ifstream bigNameFile;
340 m->openInputFile(namefile, bigNameFile);
// Names that do not end up in any group are collected in the singleton file.
342 singleton = namefile + ".extra.temp";
343 ofstream remainingNames;
344 m->openOutputFile(singleton, remainingNames);
346 bool wroteExtra = false;
// Each name-file record is a representative name followed by its name list.
348 string name, nameList;
349 while(!bigNameFile.eof()){
350 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
352 //did this sequence get assigned a group
353 it = seqGroup.find(name);
355 if (it != seqGroup.end()) {
356 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
357 outFile << name << '\t' << nameList << endl;
// Otherwise (presumably the else branch) the record goes to the singleton file.
361 remainingNames << name << '\t' << nameList << endl;
// Pair every group's name file with its distance file.
366 for(int i=0;i<numGroups;i++){
367 string tempNameFile = namefile + "." + toString(i) + ".temp";
368 string tempDistFile = distFile + "." + toString(i) + ".temp";
370 //if there are valid distances
371 if (validDistances[i]) {
372 map<string, string> temp;
373 temp[tempDistFile] = tempNameFile;
374 dists.push_back(temp);
// NOTE(review): the lines below presumably belong to the no-valid-distances branch; they copy
// the group's names into the singleton file and delete the group's temp name file.
377 m->openInputFile(tempNameFile, in);
380 in >> name >> nameList; m->gobble(in);
382 remainingNames << name << '\t' << nameList << endl;
385 remove(tempNameFile.c_str());
389 remainingNames.close();
// NOTE(review): presumably guarded by wroteExtra so the file is only deleted when it is empty - confirm.
392 remove(singleton.c_str());
// On user interrupt, delete every registered distance/name file pair.
396 if (m->control_pressed) {
397 for (int i = 0; i < dists.size(); i++) {
398 remove((dists[i].begin()->first).c_str());
399 remove((dists[i].begin()->second).c_str());
406 catch(exception& e) {
407 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
411 /***********************************************************************/
// Splits the distance file into groups of linked sequences while keeping little in memory.
// Two sequences that share a distance record end up in the same group; when a record links
// two existing groups they are merged into the one with the smaller id. Distances are
// buffered per group in outputs and appended to <distFile>.<group>.temp when the buffer
// grows past 60 entries; wroteOutPut remembers which groups already have data on disk.
412 int SplitMatrix::splitDistanceLarge(){
// groups[i] holds the sequence names currently assigned to group i.
414 vector<set<string> > groups;
416 //for buffering the io to improve speed
417 //buffer distances per group and write them out once more than 60 are pending (see the numOutputs check below).
418 vector<string> outputs;
419 vector<int> numOutputs;
420 vector<bool> wroteOutPut;
426 m->openInputFile(distFile, dFile);
// Each record is two sequence names followed by their distance.
432 dFile >> seqA >> seqB >> dist;
// On user interrupt, close the input, delete the temp files of non-empty groups and return 0.
434 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
437 //cout << "in cutoff: " << dist << endl;
// Look for the groups that already contain seqA and seqB.
442 for(int i=0;i<numGroups;i++){
443 set<string>::iterator aIt = groups[i].find(seqA);
444 set<string>::iterator bIt = groups[i].find(seqB);
446 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
447 groups[i].insert(seqB);
451 //cout << "in aIt: " << groupID << endl;
454 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
455 groups[i].insert(seqA);
459 // cout << "in bIt: " << groupID << endl;
463 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// The group with the smaller id absorbs the other one, which is then emptied.
464 if(groupIDA < groupIDB){
465 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
466 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
467 groups[groupIDB].clear();
471 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
472 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
473 groups[groupIDA].clear();
480 //windows is gonna gag on the reuse of outFile, will need to make it local...
482 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
483 set<string> newGroup;
484 newGroup.insert(seqA);
485 newGroup.insert(seqB);
486 groups.push_back(newGroup);
// The new group starts with this one distance in its buffer and nothing on disk.
488 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
489 outputs.push_back(tempOut);
490 numOutputs.push_back(1);
491 wroteOutPut.push_back(false);
// Existing group: the distance goes to the temp file named after groupID.
496 string fileName = distFile + "." + toString(groupID) + ".temp";
498 //have we reached the max buffer size
499 if (numOutputs[groupID] > 60) { //write out sequence
500 outFile.open(fileName.c_str(), ios::app);
501 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
504 outputs[groupID] = "";
505 numOutputs[groupID] = 0;
506 wroteOutPut[groupID] = true;
// Otherwise (presumably the else branch) just add the distance to the group's buffer.
508 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
509 numOutputs[groupID]++;
512 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
513 string row, column, distance;
514 if(groupIDA<groupIDB){
// Move group B's buffered distances into the surviving group's buffer.
517 numOutputs[groupID] += numOutputs[groupIDB];
518 outputs[groupID] += outputs[groupIDB];
520 outputs[groupIDB] = "";
521 numOutputs[groupIDB] = 0;
523 //if groupB is written to file it is above buffer size so read and write to new merged file
524 if (wroteOutPut[groupIDB]) {
525 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// Opening with ios::ate positions the stream at the end so tellg() yields the file size.
526 ifstream fileB(fileName2.c_str(), ios::ate);
528 outFile.open(fileName.c_str(), ios::app);
533 size = fileB.tellg();
535 fileB.seekg (0, ios::beg);
// Copy group B's file in 1024-byte chunks, followed by one final partial chunk.
537 int numRead = size / 1024;
538 int lastRead = size % 1024;
540 for (int i = 0; i < numRead; i++) {
// NOTE(review): the matching delete[] for memblock is not visible in this excerpt - confirm.
542 memblock = new char [1024];
544 fileB.read (memblock, 1024);
// NOTE(review): read() does not NUL-terminate memblock, so the string conversion below
// scans past the end of the buffer; substr() then trims the result back to the chunk size.
546 string temp = memblock;
547 outFile << temp.substr(0, 1024);
552 memblock = new char [lastRead];
554 fileB.read (memblock, lastRead);
556 //not sure why but it will read more than lastRead char...??
// NOTE(review): the extra characters come from the missing NUL terminator described above.
557 string temp = memblock;
558 outFile << temp.substr(0, lastRead);
// Group B's file has been copied, so it is deleted.
562 remove(fileName2.c_str());
564 //write out the merged memory
565 if (numOutputs[groupID] > 60) {
566 outFile << outputs[groupID];
567 outputs[groupID] = "";
568 numOutputs[groupID] = 0;
// The surviving group now has data on disk; group B no longer does.
573 wroteOutPut[groupID] = true;
574 wroteOutPut[groupIDB] = false;
575 }else{ } //just merge b's memory with a's memory
// Mirror case: move group A's buffered distances into the surviving group's buffer.
578 numOutputs[groupID] += numOutputs[groupIDA];
579 outputs[groupID] += outputs[groupIDA];
581 outputs[groupIDA] = "";
582 numOutputs[groupIDA] = 0;
// If group A already has data on disk, copy its file into the surviving group's file.
584 if (wroteOutPut[groupIDA]) {
585 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
586 ifstream fileB(fileName2.c_str(), ios::ate);
588 outFile.open(fileName.c_str(), ios::app);
593 size = fileB.tellg();
595 fileB.seekg (0, ios::beg);
// Copy group A's file in 1024-byte chunks, followed by one final partial chunk.
597 int numRead = size / 1024;
598 int lastRead = size % 1024;
600 for (int i = 0; i < numRead; i++) {
// NOTE(review): the matching delete[] for memblock is not visible in this excerpt - confirm.
602 memblock = new char [1024];
604 fileB.read (memblock, 1024);
// NOTE(review): memblock is not NUL-terminated after read(); see the note in the branch above.
605 string temp = memblock;
606 outFile << temp.substr(0, 1024);
611 memblock = new char [lastRead];
613 fileB.read (memblock, lastRead);
615 //not sure why but it will read more than lastRead char...??
// NOTE(review): the extra characters come from the missing NUL terminator on memblock.
616 string temp = memblock;
617 outFile << temp.substr(0, lastRead);
// Group A's file has been copied, so it is deleted.
622 remove(fileName2.c_str());
624 //write out the merged memory
625 if (numOutputs[groupID] > 60) {
626 outFile << outputs[groupID];
627 outputs[groupID] = "";
628 numOutputs[groupID] = 0;
// The surviving group now has data on disk; group A no longer does.
633 wroteOutPut[groupID] = true;
634 wroteOutPut[groupIDA] = false;
635 }else { } //just merge memory
// After the whole file is read, append whatever is still buffered for each group.
644 for (int i = 0; i < numGroups; i++) {
645 if (numOutputs[i] > 0) {
646 string fileName = distFile + "." + toString(i) + ".temp";
647 outFile.open(fileName.c_str(), ios::app);
648 outFile << outputs[i];
657 catch(exception& e) {
658 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
662 //********************************************************************************************************************
// Splits the name file to match the groups produced by a distance split.
// For every non-empty group a <namefile>.<group>.temp file is written with the name-file
// records of the group's sequences, and the <distFile>.<group>.temp / name-file pair is
// registered in dists. Records still present in nameMap afterwards are written to the
// singleton file <namefile>.extra.temp; with none left, singleton is set to "none".
// Parameter: groups holds the set of sequence names for each group.
663 int SplitMatrix::splitNames(vector<set<string> >& groups){
665 int numGroups = groups.size();
667 ifstream bigNameFile(namefile.c_str());
// NOTE(review): the condition guarding this message is not visible in this excerpt.
669 cerr << "Error: We can't open the name file\n";
// nameMap maps each representative name to its name list, loaded from the name file.
673 map<string, string> nameMap;
674 string name, nameList;
676 bigNameFile >> name >> nameList;
677 nameMap[name] = nameList;
678 m->gobble(bigNameFile);
682 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
683 int numSeqsInGroup = groups[i].size();
// Empty groups (for example ones merged away earlier) get no name file.
685 if(numSeqsInGroup > 0){
686 string fileName = namefile + "." + toString(i) + ".temp";
687 ofstream smallNameFile(fileName.c_str(), ios::ate);
689 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
690 map<string,string>::iterator nIt = nameMap.find(*gIt);
691 if (nIt != nameMap.end()) {
692 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// A sequence from the distance file that has no name-file record is fatal: the program exits.
695 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
698 smallNameFile.close();
702 //names of singletons
// NOTE(review): this relies on grouped names having been removed from nameMap on a line
// that is not visible in this excerpt - confirm.
703 if (nameMap.size() != 0) {
704 singleton = namefile + ".extra.temp";
705 ofstream remainingNames(singleton.c_str(), ios::ate);
706 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
707 remainingNames << nIt->first << '\t' << nIt->second << endl;
709 remainingNames.close();
710 }else { singleton = "none"; }
// Register a distance-file -> name-file pair for every non-empty group.
712 for(int i=0;i<numGroups;i++){
713 if(groups[i].size() > 0){
714 string tempNameFile = namefile + "." + toString(i) + ".temp";
715 string tempDistFile = distFile + "." + toString(i) + ".temp";
717 map<string, string> temp;
718 temp[tempDistFile] = tempNameFile;
719 dists.push_back(temp);
// On user interrupt, delete every registered distance/name file pair.
723 if (m->control_pressed) {
724 for (int i = 0; i < dists.size(); i++) {
725 remove((dists[i].begin()->first).c_str());
726 remove((dists[i].begin()->second).c_str());
733 catch(exception& e) {
734 m->errorOut(e, "SplitMatrix", "splitNames");
738 //********************************************************************************************************************
// Splits the distance file into groups of linked sequences, holding every group's
// distances in memory (outputs[i]) until the whole input has been read.
// Grouping follows the same rules as splitDistanceLarge(): sequences that share a distance
// record join the same group, and two linked groups are merged into the smaller id.
739 int SplitMatrix::splitDistanceRAM(){
// groups[i] holds the names in group i; outputs[i] holds that group's distance records as text.
741 vector<set<string> > groups;
742 vector<string> outputs;
747 m->openInputFile(distFile, dFile);
// Each record is two sequence names followed by their distance.
753 dFile >> seqA >> seqB >> dist;
// On user interrupt, close the input, delete the temp files of non-empty groups and return 0.
755 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
758 //cout << "in cutoff: " << dist << endl;
// Look for the groups that already contain seqA and seqB.
763 for(int i=0;i<numGroups;i++){
764 set<string>::iterator aIt = groups[i].find(seqA);
765 set<string>::iterator bIt = groups[i].find(seqB);
767 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
768 groups[i].insert(seqB);
772 //cout << "in aIt: " << groupID << endl;
775 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
776 groups[i].insert(seqA);
780 // cout << "in bIt: " << groupID << endl;
784 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// The group with the smaller id absorbs the other one, which is then emptied.
785 if(groupIDA < groupIDB){
786 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
787 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
788 groups[groupIDB].clear();
792 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
793 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
794 groups[groupIDA].clear();
801 //windows is gonna gag on the reuse of outFile, will need to make it local...
803 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
804 set<string> newGroup;
805 newGroup.insert(seqA);
806 newGroup.insert(seqB);
807 groups.push_back(newGroup);
// The new group's buffer starts with this one distance record.
809 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
810 outputs.push_back(tempOut);
// Existing group: add the distance record to that group's in-memory buffer.
815 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
817 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
818 string row, column, distance;
819 if(groupIDA<groupIDB){
// Move the absorbed group's buffered text into the surviving group's buffer.
821 outputs[groupID] += outputs[groupIDB];
822 outputs[groupIDB] = "";
824 outputs[groupID] += outputs[groupIDA];
825 outputs[groupIDA] = "";
// After the whole file is read, write each non-empty buffer to <distFile>.<group>.temp.
834 for (int i = 0; i < numGroups; i++) {
835 if (outputs[i] != "") {
837 string fileName = distFile + "." + toString(i) + ".temp";
838 outFile.open(fileName.c_str(), ios::ate);
839 outFile << outputs[i];
848 catch(exception& e) {
849 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
853 //********************************************************************************************************************
854 //sorts biggest to smallest
// Comparator over distance-file -> name-file maps. The first key of each map is taken as
// a file name, and the function returns true when the left file is larger than the right.
// The size is read by opening the file in binary mode, seeking to the end and calling ftell.
855 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
860 //get num bytes in file
861 string filename = left.begin()->first;
862 pFile = fopen (filename.c_str(),"rb");
863 string error = "Error opening " + filename;
// A failed open is reported with perror.
// NOTE(review): whether the fseek/ftell below are skipped in that case is not visible here - confirm.
864 if (pFile==NULL) perror (error.c_str());
866 fseek (pFile, 0, SEEK_END);
867 leftsize=ftell (pFile);
874 //get num bytes in file
875 filename = right.begin()->first;
876 pFile2 = fopen (filename.c_str(),"rb");
877 error = "Error opening " + filename;
878 if (pFile2==NULL) perror (error.c_str());
880 fseek (pFile2, 0, SEEK_END);
881 rightsize=ftell (pFile2);
// Larger files compare as "less", which yields a descending-size order when used with sort.
885 return (leftsize > rightsize);
887 /***********************************************************************/
888 //returns map of distance files -> namefile sorted by distance file size
// dists is sorted in place with compareFileSizes, so the largest distance file comes first.
// NOTE(review): the return statement is not visible in this excerpt; presumably dists is returned.
889 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
892 sort(dists.begin(), dists.end(), compareFileSizes);
896 catch(exception& e) {
897 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
901 /***********************************************************************/
// Destructor: empty body, performs no explicit cleanup.
902 SplitMatrix::~SplitMatrix(){}
903 /***********************************************************************/