5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
15 /***********************************************************************/
// Constructor for splitting an existing distance file.
// Takes the distance file name, a names file, a taxonomy file, a float cutoff `c`,
// a method string `t` and a bool `l`.
// NOTE(review): only the MothurOut lookup is visible here; how the arguments are stored in
// members (presumably distFile/namefile/taxFile/cutoff/method/large) should be confirmed.
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
// cache the shared MothurOut instance used for all logging / file helpers in this class
18 m = MothurOut::getInstance();
26 /***********************************************************************/
// Constructor for the fasta workflow, where distances are computed per group instead of
// being read from an existing distance file.
// `c` is the taxonomy level cutoff and `cu` is the distance cutoff used when the per-group
// distance matrices are created.
// NOTE(review): `p` and `output` are presumably the processor count and output directory
// (see their use in createDistanceFilesFromTax) — the assignments are not visible here.
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
// cache the shared MothurOut instance used for all logging / file helpers in this class
29 m = MothurOut::getInstance();
33 cutoff = c; //tax level cutoff
34 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
40 /***********************************************************************/
// Entry point: chooses a splitting strategy from `method`.
// "distance" takes one branch, "classify" and "fasta" share a second branch, and any other
// value is reported as an error.
// NOTE(review): the calls made inside the first two branches are not visible here
// (presumably splitDistance() and splitClassify()) — confirm.
42 int SplitMatrix::split(){
45 if (method == "distance") {
47 }else if ((method == "classify") || (method == "fasta")) {
// unknown method: log it and fall back to a single unsplit distFile -> namefile entry
50 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
51 map<string, string> temp;
52 temp[distFile] = namefile;
53 dists.push_back(temp);
// exceptions are reported through MothurOut with the class and function name
59 m->errorOut(e, "SplitMatrix", "split");
63 /***********************************************************************/
// Splits the distance file by distance, picking the implementation from the `large` flag:
// splitDistanceLarge() buffers to temp files, splitDistanceRAM() keeps every group's output in memory.
64 int SplitMatrix::splitDistance(){
67 if (large) { splitDistanceLarge(); }
68 else { splitDistanceRAM(); }
// exceptions are reported through MothurOut with the class and function name
74 m->errorOut(e, "SplitMatrix", "splitDistance");
79 /***********************************************************************/
// Groups sequences by taxonomy and hands the grouping to the matching splitter.
// Reads (sequence name, taxonomy) pairs from taxFile into a PhyloTree, caps `cutoff` at the
// tree's maximum level, then records in seqGroup the group number of every sequence that
// belongs to a taxon at level == cutoff holding more than one sequence.
// Sequences from one-sequence taxa are not added to seqGroup (treated as singletons).
80 int SplitMatrix::splitClassify(){
// sequence name -> group number
84 map<string, int> seqGroup;
85 map<string, int>::iterator it;
86 map<string, int>::iterator it2;
90 //build tree from users taxonomy file
// NOTE(review): allocated with raw new; the matching delete is not visible here — confirm it exists on every path.
91 PhyloTree* phylo = new PhyloTree();
94 m->openInputFile(taxFile, in);
96 //read in users taxonomy file and add sequences to tree
99 in >> seqname >> tax; m->gobble(in);
100 phylo->addSeqToTree(seqname, tax);
104 phylo->assignHeirarchyIDs(0);
106 //make sure the cutoff is not greater than maxlevel
107 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109 //for each node in tree
110 for (int i = 0; i < phylo->getNumNodes(); i++) {
112 //is this node within the cutoff
113 TaxNode taxon = phylo->get(i);
115 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
116 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
117 for (int j = 0; j < taxon.accessions.size(); j++) {
// every accession of this taxon gets the current group number
// NOTE(review): where numGroups is declared and advanced is not visible here.
118 seqGroup[taxon.accessions[j]] = numGroups;
// "classify" splits an existing distance file; otherwise distances are built per group from the fasta file
127 if (method == "classify") {
128 splitDistanceFileByTax(seqGroup, numGroups);
130 createDistanceFilesFromTax(seqGroup, numGroups);
136 catch(exception& e) {
137 m->errorOut(e, "SplitMatrix", "splitClassify");
141 /***********************************************************************/
// Builds one distance file per taxonomy group directly from the fasta file.
//   seqGroup  - sequence name -> group number
//   numGroups - number of groups; group ids are used as 0..numGroups-1 in temp file names
// Steps visible below: split the fasta file into "<fastafile>.<group>.temp" files, run a
// DistanceCommand on each, split the names file the same way, then register every group
// whose distance file is not blank in `dists`.
142 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// working copy used to find grouped names that never show up in the fasta file
144 map<string, int> copyGroups = seqGroup;
145 map<string, int>::iterator it;
148 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
149 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
153 m->openInputFile(fastafile, in);
// read one sequence at a time; sequences with an empty name are skipped
158 Sequence query(in); m->gobble(in);
159 if (query.getName() != "") {
161 it = seqGroup.find(query.getName());
163 //save names in case no namefile is given
164 if (namefile == "") { names.insert(query.getName()); }
166 if (it != seqGroup.end()) { //not singleton
// append to this group's temp fasta file (the file is reopened in append mode per sequence)
167 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
168 query.printSequence(outFile);
// seen in the fasta file, so it is not "missing"
171 copyGroups.erase(query.getName());
177 //warn about sequence in groups that are not in fasta file
178 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
179 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
185 //process each distance file
186 for (int i = 0; i < numGroups; i++) {
// option string for the distance command: group fasta file, processor count and distance cutoff
188 string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
189 if (outputDir != "") { options += ", outputdir=" + outputDir; }
// NOTE(review): allocated with raw new; the execute/delete calls are not visible here — confirm.
191 Command* command = new DistanceCommand(options);
// the group's temp fasta file is no longer needed
196 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
198 //remove old names files just in case
199 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
// names that are not assigned to any group are collected in the ".extra.temp" file
202 singleton = namefile + ".extra.temp";
203 ofstream remainingNames;
204 m->openOutputFile(singleton, remainingNames);
// NOTE(review): where wroteExtra is set is not visible here; presumably it tracks whether
// anything was written to remainingNames — confirm.
206 bool wroteExtra = false;
208 ifstream bigNameFile;
209 m->openInputFile(namefile, bigNameFile);
211 string name, nameList;
212 while(!bigNameFile.eof()){
213 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
215 //did this sequence get assigned a group
216 it = seqGroup.find(name);
218 if (it != seqGroup.end()) {
// grouped: goes to the group's temp names file
219 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
220 outFile << name << '\t' << nameList << endl;
// not grouped: goes to the extra (singleton) names file
224 remainingNames << name << '\t' << nameList << endl;
// pair every group's distance file with its names file
229 for(int i=0;i<numGroups;i++){
230 string tempNameFile = namefile + "." + toString(i) + ".temp";
// note: this assigns the outputDir member when it is empty, so later iterations reuse the value
231 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
232 string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
234 //if there are valid distances
236 fileHandle.open(tempDistFile.c_str());
238 m->gobble(fileHandle);
239 if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff
240 map<string, string> temp;
241 temp[tempDistFile] = tempNameFile;
242 dists.push_back(temp);
// NOTE(review): presumably the blank-file branch — the group's names are moved into the
// extra names file and the group's temp names file is removed; the branch itself is not visible.
245 m->openInputFile(tempNameFile, in);
248 in >> name >> nameList; m->gobble(in);
250 remainingNames << name << '\t' << nameList << endl;
253 m->mothurRemove(tempNameFile);
259 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible here — confirm.
261 m->mothurRemove(singleton);
// on user interrupt, delete every registered distance/names file pair and forget them
265 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
269 catch(exception& e) {
270 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
274 /***********************************************************************/
// Splits an existing column distance file into one temp file per taxonomy group.
//   seqGroup  - sequence name -> group number
//   numGroups - number of groups; group ids index the buffers below and name the temp files
// A distance is kept only when both sequences are grouped and share the same group.
// The names file is split the same way, and every group that received at least one distance
// is registered in `dists` as "<distFile>.<group>.temp" -> "<namefile>.<group>.temp".
275 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
277 map<string, int>::iterator it;
278 map<string, int>::iterator it2;
281 m->openInputFile(distFile, dFile);
284 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
285 m->mothurRemove((distFile + "." + toString(i) + ".temp"));
288 //for buffering the io to improve speed
289 //distances are buffered per group and flushed once more than 30 are pending (see the numOutputs check below)
290 vector<string> outputs; outputs.resize(numGroups, "");
291 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
293 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
294 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
295 vector<bool> validDistances; validDistances.resize(numGroups, false);
// on user interrupt, close the input and delete the per-group temp distance files
302 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } }
// one record per read: two sequence names and their distance
304 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
306 //if both sequences are in the same group then they are within the cutoff
307 it = seqGroup.find(seqA);
308 it2 = seqGroup.find(seqB);
310 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
311 if (it->second == it2->second) { //they are from the same group so add the distance
// buffer is full: write the buffered text plus the current record and reset the buffer
312 if (numOutputs[it->second] > 30) {
313 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
314 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
316 outputs[it->second] = "";
317 numOutputs[it->second] = 0;
318 validDistances[it->second] = true;
// otherwise keep the record in memory
// NOTE(review): the buffered path formats dist via toString() while the flush path streams it
// directly; whether both produce identical text depends on toString — confirm.
320 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
321 numOutputs[it->second]++;
328 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
329 m->mothurRemove((namefile + "." + toString(i) + ".temp"));
331 //write out any remaining buffers
332 if (numOutputs[i] > 0) {
333 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
334 outFile << outputs[i];
338 validDistances[i] = true;
342 ifstream bigNameFile;
343 m->openInputFile(namefile, bigNameFile);
// names that are not assigned to any group are collected in the ".extra.temp" file
345 singleton = namefile + ".extra.temp";
346 ofstream remainingNames;
347 m->openOutputFile(singleton, remainingNames);
// NOTE(review): where wroteExtra is set is not visible here — confirm.
349 bool wroteExtra = false;
351 string name, nameList;
352 while(!bigNameFile.eof()){
353 bigNameFile >> name >> nameList; m->gobble(bigNameFile);
355 //did this sequence get assigned a group
356 it = seqGroup.find(name);
358 if (it != seqGroup.end()) {
// grouped: goes to the group's temp names file
359 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
360 outFile << name << '\t' << nameList << endl;
// not grouped: goes to the extra (singleton) names file
364 remainingNames << name << '\t' << nameList << endl;
// pair every group's distance file with its names file
369 for(int i=0;i<numGroups;i++){
370 string tempNameFile = namefile + "." + toString(i) + ".temp";
371 string tempDistFile = distFile + "." + toString(i) + ".temp";
373 //if there are valid distances
374 if (validDistances[i]) {
375 map<string, string> temp;
376 temp[tempDistFile] = tempNameFile;
377 dists.push_back(temp);
// NOTE(review): presumably the no-valid-distances branch — the group's names are moved into the
// extra names file and the group's temp names file is removed; the branch itself is not visible.
380 m->openInputFile(tempNameFile, in);
383 in >> name >> nameList; m->gobble(in);
385 remainingNames << name << '\t' << nameList << endl;
388 m->mothurRemove(tempNameFile);
392 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible here — confirm.
395 m->mothurRemove(singleton);
// on user interrupt, delete every registered distance/names file pair
399 if (m->control_pressed) {
400 for (int i = 0; i < dists.size(); i++) {
401 m->mothurRemove((dists[i].begin()->first));
402 m->mothurRemove((dists[i].begin()->second));
409 catch(exception& e) {
410 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
414 /***********************************************************************/
// Splits the distance file into groups of sequences that are linked by a distance record,
// buffering each group's records in memory and spilling them to "<distFile>.<group>.temp".
// For each record (seqA, seqB, dist): find the groups already holding seqA / seqB, create a
// new group when neither is known, and merge two groups (sets, buffers and temp files) when the
// record links them. The merged group always keeps the smaller of the two ids.
// NOTE(review): the comments below mention a cutoff test on dist, but the test itself and the
// declarations of numGroups/groupID/groupIDA/groupIDB are not visible here — confirm.
415 int SplitMatrix::splitDistanceLarge(){
// groups[i] holds the sequence names of group i; a merged-away group is left empty
417 vector<set<string> > groups;
419 //for buffering the io to improve speed
420 //distances are buffered per group and flushed once more than 60 are pending (see the numOutputs checks below)
// outputs[i]: buffered text, numOutputs[i]: buffered record count, wroteOutPut[i]: group i has data on disk
421 vector<string> outputs;
422 vector<int> numOutputs;
423 vector<bool> wroteOutPut;
429 m->openInputFile(distFile, dFile);
435 dFile >> seqA >> seqB >> dist;
// on user interrupt, close the input, delete the temp file of every non-empty group and stop
437 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
440 //cout << "in cutoff: " << dist << endl;
// look for existing groups that already contain either sequence
445 for(int i=0;i<numGroups;i++){
446 set<string>::iterator aIt = groups[i].find(seqA);
447 set<string>::iterator bIt = groups[i].find(seqB);
449 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
450 groups[i].insert(seqB);
454 //cout << "in aIt: " << groupID << endl;
457 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
458 groups[i].insert(seqA);
462 // cout << "in bIt: " << groupID << endl;
466 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
467 if(groupIDA < groupIDB){
468 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
469 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
470 groups[groupIDB].clear();
474 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
475 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
476 groups[groupIDA].clear();
483 //windows is gonna gag on the reuse of outFile, will need to make it local...
485 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
486 set<string> newGroup;
487 newGroup.insert(seqA);
488 newGroup.insert(seqB);
489 groups.push_back(newGroup);
// the new group starts with this record buffered and nothing on disk
491 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
492 outputs.push_back(tempOut);
493 numOutputs.push_back(1);
494 wroteOutPut.push_back(false);
// existing group: the record goes to that group's buffer or temp file
499 string fileName = distFile + "." + toString(groupID) + ".temp";
501 //have we reached the max buffer size
502 if (numOutputs[groupID] > 60) { //write out sequence
503 outFile.open(fileName.c_str(), ios::app);
504 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
507 outputs[groupID] = "";
508 numOutputs[groupID] = 0;
509 wroteOutPut[groupID] = true;
511 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
512 numOutputs[groupID]++;
515 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
// NOTE(review): row/column/distance are not referenced on any line visible here.
516 string row, column, distance;
517 if(groupIDA<groupIDB){
// fold group B's in-memory buffer into the surviving group
520 numOutputs[groupID] += numOutputs[groupIDB];
521 outputs[groupID] += outputs[groupIDB];
523 outputs[groupIDB] = "";
524 numOutputs[groupIDB] = 0;
526 //if groupB is written to file it is above buffer size so read and write to new merged file
527 if (wroteOutPut[groupIDB]) {
528 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
// opened at end-of-file so tellg() below yields the file size
529 ifstream fileB(fileName2.c_str(), ios::ate);
531 outFile.open(fileName.c_str(), ios::app);
536 size = fileB.tellg();
538 fileB.seekg (0, ios::beg);
// copy group B's file in 1024-byte chunks plus one final partial chunk
540 int numRead = size / 1024;
541 int lastRead = size % 1024;
543 for (int i = 0; i < numRead; i++) {
545 memblock = new char [1024];
547 fileB.read (memblock, 1024);
// NOTE(review): read() does not NUL-terminate memblock, and building a std::string from a char*
// scans until the first NUL byte, so this can read past the 1024-byte buffer; substr() only trims
// afterwards. Writing the bytes with outFile.write(memblock, 1024) would avoid the scan.
549 string temp = memblock;
550 outFile << temp.substr(0, 1024);
// NOTE(review): when lastRead is 0 this allocates a zero-length buffer and still builds a string from it.
555 memblock = new char [lastRead];
557 fileB.read (memblock, lastRead);
559 //not sure why but it will read more than lastRead char...??
// NOTE(review): the extra characters come from the missing NUL terminator described above.
560 string temp = memblock;
561 outFile << temp.substr(0, lastRead);
// group B's file has been copied into the surviving group's file
565 m->mothurRemove(fileName2);
567 //write out the merged memory
568 if (numOutputs[groupID] > 60) {
569 outFile << outputs[groupID];
570 outputs[groupID] = "";
571 numOutputs[groupID] = 0;
576 wroteOutPut[groupID] = true;
577 wroteOutPut[groupIDB] = false;
578 }else{ } //just merge b's memory with a's memory
// mirror case: group A is folded into the surviving group
581 numOutputs[groupID] += numOutputs[groupIDA];
582 outputs[groupID] += outputs[groupIDA];
584 outputs[groupIDA] = "";
585 numOutputs[groupIDA] = 0;
587 if (wroteOutPut[groupIDA]) {
588 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
// opened at end-of-file so tellg() below yields the file size
589 ifstream fileB(fileName2.c_str(), ios::ate);
591 outFile.open(fileName.c_str(), ios::app);
596 size = fileB.tellg();
598 fileB.seekg (0, ios::beg);
600 int numRead = size / 1024;
601 int lastRead = size % 1024;
603 for (int i = 0; i < numRead; i++) {
605 memblock = new char [1024];
607 fileB.read (memblock, 1024);
// NOTE(review): same unterminated-buffer issue as in the group B copy above.
608 string temp = memblock;
609 outFile << temp.substr(0, 1024);
614 memblock = new char [lastRead];
616 fileB.read (memblock, lastRead);
618 //not sure why but it will read more than lastRead char...??
619 string temp = memblock;
620 outFile << temp.substr(0, lastRead);
625 m->mothurRemove(fileName2);
627 //write out the merged memory
628 if (numOutputs[groupID] > 60) {
629 outFile << outputs[groupID];
630 outputs[groupID] = "";
631 numOutputs[groupID] = 0;
636 wroteOutPut[groupID] = true;
637 wroteOutPut[groupIDA] = false;
638 }else { } //just merge memory
// after the whole file is read, flush whatever is still buffered for each group
647 for (int i = 0; i < numGroups; i++) {
648 if (numOutputs[i] > 0) {
649 string fileName = distFile + "." + toString(i) + ".temp";
650 outFile.open(fileName.c_str(), ios::app);
651 outFile << outputs[i];
660 catch(exception& e) {
661 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
665 //********************************************************************************************************************
// Splits the names file to match the groups built from the distance file.
//   groups - groups[i] is the set of sequence names in group i; empty sets are skipped
// Writes "<namefile>.<i>.temp" for every non-empty group, puts whatever is left in nameMap into
// "<namefile>.extra.temp" (or sets singleton to "none"), and registers each non-empty group in
// `dists` as "<distFile>.<i>.temp" -> "<namefile>.<i>.temp".
666 int SplitMatrix::splitNames(vector<set<string> >& groups){
668 int numGroups = groups.size();
670 ifstream bigNameFile(namefile.c_str());
// NOTE(review): this reports to cerr directly rather than through MothurOut like the rest of the class.
672 cerr << "Error: We can't open the name file\n";
// load the whole names file: first column -> second column
676 map<string, string> nameMap;
677 string name, nameList;
679 bigNameFile >> name >> nameList;
680 nameMap[name] = nameList;
681 m->gobble(bigNameFile);
685 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
686 int numSeqsInGroup = groups[i].size();
688 if(numSeqsInGroup > 0){
689 string fileName = namefile + "." + toString(i) + ".temp";
690 ofstream smallNameFile(fileName.c_str(), ios::ate);
692 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
693 map<string,string>::iterator nIt = nameMap.find(*gIt);
694 if (nIt != nameMap.end()) {
695 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// NOTE(review): presumably the written entry is also erased from nameMap on a line not visible
// here, which is what leaves only ungrouped names for the "singletons" step — confirm.
// a name in the distance file that is missing from the names file is fatal: the process exits
698 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
701 smallNameFile.close();
705 //names of singletons
706 if (nameMap.size() != 0) {
707 singleton = namefile + ".extra.temp";
708 ofstream remainingNames(singleton.c_str(), ios::ate);
709 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
710 remainingNames << nIt->first << '\t' << nIt->second << endl;
712 remainingNames.close();
713 }else { singleton = "none"; }
// register the distance/names temp file pair of every non-empty group
715 for(int i=0;i<numGroups;i++){
716 if(groups[i].size() > 0){
717 string tempNameFile = namefile + "." + toString(i) + ".temp";
718 string tempDistFile = distFile + "." + toString(i) + ".temp";
720 map<string, string> temp;
721 temp[tempDistFile] = tempNameFile;
722 dists.push_back(temp);
// on user interrupt, delete every registered distance/names file pair
726 if (m->control_pressed) {
727 for (int i = 0; i < dists.size(); i++) {
728 m->mothurRemove((dists[i].begin()->first));
729 m->mothurRemove((dists[i].begin()->second));
736 catch(exception& e) {
737 m->errorOut(e, "SplitMatrix", "splitNames");
741 //********************************************************************************************************************
// In-memory variant of splitDistanceLarge(): groups sequences that are linked by a distance
// record, but keeps every group's records in outputs[i] until the whole file has been read and
// only then writes "<distFile>.<i>.temp" for each group with buffered text.
// NOTE(review): the comments below mention a cutoff test on dist, but the test itself and the
// declarations of numGroups/groupID/groupIDA/groupIDB are not visible here — confirm.
742 int SplitMatrix::splitDistanceRAM(){
// groups[i]: sequence names of group i; outputs[i]: that group's distance records as text
744 vector<set<string> > groups;
745 vector<string> outputs;
750 m->openInputFile(distFile, dFile);
756 dFile >> seqA >> seqB >> dist;
// on user interrupt, close the input, delete the temp file of every non-empty group and stop
758 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
761 //cout << "in cutoff: " << dist << endl;
// look for existing groups that already contain either sequence
766 for(int i=0;i<numGroups;i++){
767 set<string>::iterator aIt = groups[i].find(seqA);
768 set<string>::iterator bIt = groups[i].find(seqB);
770 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
771 groups[i].insert(seqB);
775 //cout << "in aIt: " << groupID << endl;
778 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
779 groups[i].insert(seqA);
783 // cout << "in bIt: " << groupID << endl;
787 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
788 if(groupIDA < groupIDB){
789 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
790 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
791 groups[groupIDB].clear();
795 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
796 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
797 groups[groupIDA].clear();
804 //windows is gonna gag on the reuse of outFile, will need to make it local...
806 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
807 set<string> newGroup;
808 newGroup.insert(seqA);
809 newGroup.insert(seqB);
810 groups.push_back(newGroup);
// the new group's buffer starts with this record
812 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
813 outputs.push_back(tempOut);
// existing group: append the record to that group's buffer
818 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
820 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
// NOTE(review): row/column/distance are not referenced on any line visible here.
821 string row, column, distance;
822 if(groupIDA<groupIDB){
// move the absorbed group's buffered text into the surviving group and empty the source
824 outputs[groupID] += outputs[groupIDB];
825 outputs[groupIDB] = "";
827 outputs[groupID] += outputs[groupIDA];
828 outputs[groupIDA] = "";
// after the whole file is read, write each non-empty buffer to its group's temp file
837 for (int i = 0; i < numGroups; i++) {
838 if (outputs[i] != "") {
840 string fileName = distFile + "." + toString(i) + ".temp";
841 outFile.open(fileName.c_str(), ios::ate);
842 outFile << outputs[i];
851 catch(exception& e) {
852 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
856 //********************************************************************************************************************
//returns the size in bytes of the named file, or 0 when the file cannot be opened or measured.
//a failed open is reported with perror ("Error opening <filename>") and is not fatal.
inline long fileSizeInBytes(const std::string& filename){
	FILE* pFile = fopen (filename.c_str(),"rb");
	if (pFile==NULL) {
		const std::string error = "Error opening " + filename;
		perror (error.c_str());
		return 0;
	}

	long numBytes = 0;
	//seek to the end so ftell reports the total length; a failed seek/tell leaves the size at 0
	if (fseek (pFile, 0, SEEK_END) == 0) {
		const long endPosition = ftell (pFile);
		if (endPosition > 0) { numBytes = endPosition; }
	}

	fclose (pFile); //always release the handle, whichever way the measurement went
	return numBytes;
}
//sorts biggest to smallest
//comparator for sort(): true when the distance file named by the first key of left is strictly
//larger, in bytes, than the one named by the first key of right.
//an empty map, or a file that cannot be opened, counts as size 0.
inline bool compareFileSizes(std::map<std::string, std::string> left, std::map<std::string, std::string> right){
	//guard begin() on an empty map; dereferencing it would be undefined behaviour
	const long leftsize = left.empty() ? 0 : fileSizeInBytes(left.begin()->first);
	const long rightsize = right.empty() ? 0 : fileSizeInBytes(right.begin()->first);

	return (leftsize > rightsize);
}
890 /***********************************************************************/
891 //returns map of distance files -> namefile sorted by distance file size
// Each element of `dists` is a one-entry map (distance file -> names file); the vector is
// sorted in place with compareFileSizes, which puts the largest distance file first.
// NOTE(review): compareFileSizes opens both files on every comparison, so sorting does file I/O
// per comparison; the return statement is not visible here.
892 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
895 sort(dists.begin(), dists.end(), compareFileSizes);
899 catch(exception& e) {
900 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
904 /***********************************************************************/
// Destructor: empty body, so nothing is released here (temp files registered in `dists` are left on disk).
905 SplitMatrix::~SplitMatrix(){}
906 /***********************************************************************/