5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
14 /***********************************************************************/
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17 m = MothurOut::getInstance();
25 /***********************************************************************/
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p){
28 m = MothurOut::getInstance();
37 /***********************************************************************/
39 int SplitMatrix::split(){
42 if (method == "distance") {
44 }else if ((method == "classify") || (method == "fasta")) {
47 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
48 map<string, string> temp;
49 temp[distFile] = namefile;
50 dists.push_back(temp);
56 m->errorOut(e, "SplitMatrix", "split");
60 /***********************************************************************/
61 int SplitMatrix::splitDistance(){
64 if (large) { splitDistanceLarge(); }
65 else { splitDistanceRAM(); }
69 m->errorOut(e, "SplitMatrix", "splitDistance");
74 /***********************************************************************/
75 int SplitMatrix::splitClassify(){
79 map<string, int> seqGroup;
80 map<string, int>::iterator it;
81 map<string, int>::iterator it2;
85 //build tree from users taxonomy file
86 PhyloTree* phylo = new PhyloTree();
89 openInputFile(taxFile, in);
91 //read in users taxonomy file and add sequences to tree
94 in >> seqname >> tax; gobble(in);
95 phylo->addSeqToTree(seqname, tax);
99 phylo->assignHeirarchyIDs(0);
101 //make sure the cutoff is not greater than maxlevel
102 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
104 //for each node in tree
105 for (int i = 0; i < phylo->getNumNodes(); i++) {
107 //is this node within the cutoff
108 TaxNode taxon = phylo->get(i);
110 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
111 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
112 for (int j = 0; j < taxon.accessions.size(); j++) {
113 seqGroup[taxon.accessions[j]] = numGroups;
122 if (method == "classify") {
123 splitDistanceFileByTax(seqGroup, numGroups);
125 createDistanceFilesFromTax(seqGroup, numGroups);
131 catch(exception& e) {
132 m->errorOut(e, "SplitMatrix", "splitClassify");
136 /***********************************************************************/
137 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
139 map<string, int> copyGroups = seqGroup;
140 map<string, int>::iterator it;
143 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
144 remove((fastafile + "." + toString(i) + ".temp").c_str());
148 openInputFile(fastafile, in);
153 Sequence query(in); gobble(in);
154 if (query.getName() != "") {
156 it = seqGroup.find(query.getName());
158 //save names in case no namefile is given
159 if (namefile == "") { names.insert(query.getName()); }
161 if (it != seqGroup.end()) { //not singleton
162 openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
163 query.printSequence(outFile);
166 copyGroups.erase(it);
172 //warn about sequence in groups that are not in fasta file
173 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
174 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
180 //process each distance file
181 for (int i = 0; i < numGroups; i++) {
183 string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors);
185 Command* command = new DistanceCommand(options);
189 remove((fastafile + "." + toString(i) + ".temp").c_str());
191 //remove old names files just in case
192 remove((namefile + "." + toString(i) + ".temp").c_str());
195 singleton = namefile + ".extra.temp";
196 ofstream remainingNames;
197 openOutputFile(singleton, remainingNames);
199 bool wroteExtra = false;
201 ifstream bigNameFile;
202 openInputFile(namefile, bigNameFile);
204 string name, nameList;
205 while(!bigNameFile.eof()){
206 bigNameFile >> name >> nameList; gobble(bigNameFile);
208 //did this sequence get assigned a group
209 it = seqGroup.find(name);
211 if (it != seqGroup.end()) {
212 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
213 outFile << name << '\t' << nameList << endl;
217 remainingNames << name << '\t' << nameList << endl;
222 remainingNames.close();
224 remove(singleton.c_str());
228 for(int i=0;i<numGroups;i++){
229 string tempNameFile = namefile + "." + toString(i) + ".temp";
230 string tempDistFile = getRootName(getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
232 //if there are valid distances
234 fileHandle.open(tempDistFile.c_str());
237 if (!fileHandle.eof()) { //check for blank file
238 map<string, string> temp;
239 temp[tempDistFile] = tempNameFile;
240 dists.push_back(temp);
246 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
250 catch(exception& e) {
251 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
255 /***********************************************************************/
256 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
258 map<string, int>::iterator it;
259 map<string, int>::iterator it2;
262 openInputFile(distFile, dFile);
265 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
266 remove((distFile + "." + toString(i) + ".temp").c_str());
269 //for buffering the io to improve speed
270 //allow for 10 dists to be stored, then output.
271 vector<string> outputs; outputs.resize(numGroups, "");
272 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
274 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
275 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
276 vector<bool> validDistances; validDistances.resize(numGroups, false);
283 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
285 dFile >> seqA >> seqB >> dist; gobble(dFile);
287 //if both sequences are in the same group then they are within the cutoff
288 it = seqGroup.find(seqA);
289 it2 = seqGroup.find(seqB);
291 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
292 if (it->second == it2->second) { //they are from the same group so add the distance
293 if (numOutputs[it->second] > 30) {
294 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
295 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
297 outputs[it->second] = "";
298 numOutputs[it->second] = 0;
299 validDistances[it->second] = true;
301 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
302 numOutputs[it->second]++;
309 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
310 remove((namefile + "." + toString(i) + ".temp").c_str());
312 //write out any remaining buffers
313 if (numOutputs[i] > 0) {
314 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
315 outFile << outputs[i];
319 validDistances[i] = true;
323 ifstream bigNameFile;
324 openInputFile(namefile, bigNameFile);
326 singleton = namefile + ".extra.temp";
327 ofstream remainingNames;
328 openOutputFile(singleton, remainingNames);
330 bool wroteExtra = false;
332 string name, nameList;
333 while(!bigNameFile.eof()){
334 bigNameFile >> name >> nameList; gobble(bigNameFile);
336 //did this sequence get assigned a group
337 it = seqGroup.find(name);
339 if (it != seqGroup.end()) {
340 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
341 outFile << name << '\t' << nameList << endl;
345 remainingNames << name << '\t' << nameList << endl;
350 for(int i=0;i<numGroups;i++){
351 string tempNameFile = namefile + "." + toString(i) + ".temp";
352 string tempDistFile = distFile + "." + toString(i) + ".temp";
354 //if there are valid distances
355 if (validDistances[i]) {
356 map<string, string> temp;
357 temp[tempDistFile] = tempNameFile;
358 dists.push_back(temp);
361 openInputFile(tempNameFile, in);
364 in >> name >> nameList; gobble(in);
366 remainingNames << name << '\t' << nameList << endl;
369 remove(tempNameFile.c_str());
373 remainingNames.close();
376 remove(singleton.c_str());
380 if (m->control_pressed) {
381 for (int i = 0; i < dists.size(); i++) {
382 remove((dists[i].begin()->first).c_str());
383 remove((dists[i].begin()->second).c_str());
390 catch(exception& e) {
391 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
395 /***********************************************************************/
396 int SplitMatrix::splitDistanceLarge(){
398 vector<set<string> > groups;
400 //for buffering the io to improve speed
401 //allow for 30 dists to be stored, then output.
402 vector<string> outputs;
403 vector<int> numOutputs;
404 vector<bool> wroteOutPut;
410 openInputFile(distFile, dFile);
416 dFile >> seqA >> seqB >> dist;
418 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
421 //cout << "in cutoff: " << dist << endl;
426 for(int i=0;i<numGroups;i++){
427 set<string>::iterator aIt = groups[i].find(seqA);
428 set<string>::iterator bIt = groups[i].find(seqB);
430 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
431 groups[i].insert(seqB);
435 //cout << "in aIt: " << groupID << endl;
438 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
439 groups[i].insert(seqA);
443 // cout << "in bIt: " << groupID << endl;
447 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
448 if(groupIDA < groupIDB){
449 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
450 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
451 groups[groupIDB].clear();
455 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
456 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
457 groups[groupIDA].clear();
464 //windows is gonna gag on the reuse of outFile, will need to make it local...
466 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
467 set<string> newGroup;
468 newGroup.insert(seqA);
469 newGroup.insert(seqB);
470 groups.push_back(newGroup);
472 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
473 outputs.push_back(tempOut);
474 numOutputs.push_back(1);
475 wroteOutPut.push_back(false);
480 string fileName = distFile + "." + toString(groupID) + ".temp";
482 //have we reached the max buffer size
483 if (numOutputs[groupID] > 60) { //write out sequence
484 outFile.open(fileName.c_str(), ios::app);
485 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
488 outputs[groupID] = "";
489 numOutputs[groupID] = 0;
490 wroteOutPut[groupID] = true;
492 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
493 numOutputs[groupID]++;
496 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
497 string row, column, distance;
498 if(groupIDA<groupIDB){
501 numOutputs[groupID] += numOutputs[groupIDB];
502 outputs[groupID] += outputs[groupIDB];
504 outputs[groupIDB] = "";
505 numOutputs[groupIDB] = 0;
507 //if groupB is written to file it is above buffer size so read and write to new merged file
508 if (wroteOutPut[groupIDB]) {
509 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
510 ifstream fileB(fileName2.c_str(), ios::ate);
512 outFile.open(fileName.c_str(), ios::app);
517 size = fileB.tellg();
519 fileB.seekg (0, ios::beg);
521 int numRead = size / 1024;
522 int lastRead = size % 1024;
524 for (int i = 0; i < numRead; i++) {
526 memblock = new char [1024];
528 fileB.read (memblock, 1024);
530 string temp = memblock;
531 outFile << temp.substr(0, 1024);
536 memblock = new char [lastRead];
538 fileB.read (memblock, lastRead);
540 //not sure why but it will read more than lastRead char...??
541 string temp = memblock;
542 outFile << temp.substr(0, lastRead);
546 remove(fileName2.c_str());
548 //write out the merged memory
549 if (numOutputs[groupID] > 60) {
550 outFile << outputs[groupID];
551 outputs[groupID] = "";
552 numOutputs[groupID] = 0;
557 wroteOutPut[groupID] = true;
558 wroteOutPut[groupIDB] = false;
559 }else{ } //just merge b's memory with a's memory
562 numOutputs[groupID] += numOutputs[groupIDA];
563 outputs[groupID] += outputs[groupIDA];
565 outputs[groupIDA] = "";
566 numOutputs[groupIDA] = 0;
568 if (wroteOutPut[groupIDA]) {
569 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
570 ifstream fileB(fileName2.c_str(), ios::ate);
572 outFile.open(fileName.c_str(), ios::app);
577 size = fileB.tellg();
579 fileB.seekg (0, ios::beg);
581 int numRead = size / 1024;
582 int lastRead = size % 1024;
584 for (int i = 0; i < numRead; i++) {
586 memblock = new char [1024];
588 fileB.read (memblock, 1024);
589 string temp = memblock;
590 outFile << temp.substr(0, 1024);
595 memblock = new char [lastRead];
597 fileB.read (memblock, lastRead);
599 //not sure why but it will read more than lastRead char...??
600 string temp = memblock;
601 outFile << temp.substr(0, lastRead);
606 remove(fileName2.c_str());
608 //write out the merged memory
609 if (numOutputs[groupID] > 60) {
610 outFile << outputs[groupID];
611 outputs[groupID] = "";
612 numOutputs[groupID] = 0;
617 wroteOutPut[groupID] = true;
618 wroteOutPut[groupIDA] = false;
619 }else { } //just merge memory
628 for (int i = 0; i < numGroups; i++) {
629 if (numOutputs[i] > 0) {
630 string fileName = distFile + "." + toString(i) + ".temp";
631 outFile.open(fileName.c_str(), ios::app);
632 outFile << outputs[i];
641 catch(exception& e) {
642 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
646 //********************************************************************************************************************
647 int SplitMatrix::splitNames(vector<set<string> >& groups){
649 int numGroups = groups.size();
651 ifstream bigNameFile(namefile.c_str());
653 cerr << "Error: We can't open the name file\n";
657 map<string, string> nameMap;
658 string name, nameList;
660 bigNameFile >> name >> nameList;
661 nameMap[name] = nameList;
666 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
667 int numSeqsInGroup = groups[i].size();
669 if(numSeqsInGroup > 0){
670 string fileName = namefile + "." + toString(i) + ".temp";
671 ofstream smallNameFile(fileName.c_str(), ios::ate);
673 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
674 map<string,string>::iterator nIt = nameMap.find(*gIt);
675 if (nIt != nameMap.end()) {
676 smallNameFile << nIt->first << '\t' << nIt->second << endl;
679 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
682 smallNameFile.close();
686 //names of singletons
687 if (nameMap.size() != 0) {
688 singleton = namefile + ".extra.temp";
689 ofstream remainingNames(singleton.c_str(), ios::ate);
690 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
691 remainingNames << nIt->first << '\t' << nIt->second << endl;
693 remainingNames.close();
694 }else { singleton = "none"; }
696 for(int i=0;i<numGroups;i++){
697 if(groups[i].size() > 0){
698 string tempNameFile = namefile + "." + toString(i) + ".temp";
699 string tempDistFile = distFile + "." + toString(i) + ".temp";
701 map<string, string> temp;
702 temp[tempDistFile] = tempNameFile;
703 dists.push_back(temp);
707 if (m->control_pressed) {
708 for (int i = 0; i < dists.size(); i++) {
709 remove((dists[i].begin()->first).c_str());
710 remove((dists[i].begin()->second).c_str());
717 catch(exception& e) {
718 m->errorOut(e, "SplitMatrix", "splitNames");
722 //********************************************************************************************************************
723 int SplitMatrix::splitDistanceRAM(){
725 vector<set<string> > groups;
726 vector<string> outputs;
731 openInputFile(distFile, dFile);
737 dFile >> seqA >> seqB >> dist;
739 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
742 //cout << "in cutoff: " << dist << endl;
747 for(int i=0;i<numGroups;i++){
748 set<string>::iterator aIt = groups[i].find(seqA);
749 set<string>::iterator bIt = groups[i].find(seqB);
751 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
752 groups[i].insert(seqB);
756 //cout << "in aIt: " << groupID << endl;
759 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
760 groups[i].insert(seqA);
764 // cout << "in bIt: " << groupID << endl;
768 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
769 if(groupIDA < groupIDB){
770 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
771 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
772 groups[groupIDB].clear();
776 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
777 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
778 groups[groupIDA].clear();
785 //windows is gonna gag on the reuse of outFile, will need to make it local...
787 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
788 set<string> newGroup;
789 newGroup.insert(seqA);
790 newGroup.insert(seqB);
791 groups.push_back(newGroup);
793 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
794 outputs.push_back(tempOut);
799 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
801 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
802 string row, column, distance;
803 if(groupIDA<groupIDB){
805 outputs[groupID] += outputs[groupIDB];
806 outputs[groupIDB] = "";
808 outputs[groupID] += outputs[groupIDA];
809 outputs[groupIDA] = "";
818 for (int i = 0; i < numGroups; i++) {
819 if (outputs[i] != "") {
821 string fileName = distFile + "." + toString(i) + ".temp";
822 outFile.open(fileName.c_str(), ios::ate);
823 outFile << outputs[i];
832 catch(exception& e) {
833 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
837 //********************************************************************************************************************
838 //sorts biggest to smallest
839 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
844 //get num bytes in file
845 string filename = left.begin()->first;
846 pFile = fopen (filename.c_str(),"rb");
847 string error = "Error opening " + filename;
848 if (pFile==NULL) perror (error.c_str());
850 fseek (pFile, 0, SEEK_END);
851 leftsize=ftell (pFile);
858 //get num bytes in file
859 filename = right.begin()->first;
860 pFile2 = fopen (filename.c_str(),"rb");
861 error = "Error opening " + filename;
862 if (pFile2==NULL) perror (error.c_str());
864 fseek (pFile2, 0, SEEK_END);
865 rightsize=ftell (pFile2);
869 return (leftsize > rightsize);
871 /***********************************************************************/
872 //returns map of distance files -> namefile sorted by distance file size
873 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
876 sort(dists.begin(), dists.end(), compareFileSizes);
880 catch(exception& e) {
881 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
885 /***********************************************************************/
886 SplitMatrix::~SplitMatrix(){}
887 /***********************************************************************/