5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
15 /***********************************************************************/
// Constructor taking an existing distance file.
// Parameters, as declared: distfile, name, count, tax (strings), c (float), t (string), l (bool).
// NOTE(review): only the first statement of the body is visible in this listing, so how the
// arguments are stored is not shown here. Presumably they populate distFile, namefile,
// countfile, taxFile, cutoff, method and large, which other methods of this class read -- confirm.
17 SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
// Grab the shared MothurOut instance; the rest of the class uses it for logging and file helpers.
18 m = MothurOut::getInstance();
27 /***********************************************************************/
// Constructor taking a fasta file, for the case where distances are computed per group.
// Parameters, as declared: ffile, name, count, tax (strings), c (taxonomy level cutoff),
// cu (distance cutoff), t (string), p (int), cl (bool), output (string).
// NOTE(review): most of the body is not visible in this listing. Presumably ffile, t, p, cl and
// output populate fastafile, method, processors, classic and outputDir, which
// createDistanceFilesFromTax reads -- confirm.
29 SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
// Grab the shared MothurOut instance used for logging and file helpers.
30 m = MothurOut::getInstance();
35 cutoff = c; //tax level cutoff
36 distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
43 /***********************************************************************/
// Entry point: picks the splitting strategy from the `method` member.
// Recognized values are "distance" and "classify"/"fasta".
// NOTE(review): the statements executed in the two recognized branches, and the return value,
// are not visible in this listing.
45 int SplitMatrix::split(){
48 if (method == "distance") {
50 }else if ((method == "classify") || (method == "fasta")) {
// Presumably the fallback branch for an unrecognized method: log the problem and record the
// unsplit distance file, paired with the name file (or the count file when no name file was
// given), as the only entry in `dists`.
53 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
54 map<string, string> temp;
55 if (namefile != "") { temp[distFile] = namefile; }
56 else { temp[distFile] = countfile; }
57 dists.push_back(temp);
// Report any exception through MothurOut, tagged with class and method name.
63 m->errorOut(e, "SplitMatrix", "split");
67 /***********************************************************************/
// Splits the distance file using distances only.
// The `large` member selects the implementation: splitDistanceLarge() buffers per-group output
// and flushes it to temp files as it goes, while splitDistanceRAM() keeps all per-group output
// in memory until the end.
// NOTE(review): the return value is not visible in this listing.
68 int SplitMatrix::splitDistance(){
71 if (large) { splitDistanceLarge(); }
72 else { splitDistanceRAM(); }
// Report any exception through MothurOut, tagged with class and method name.
78 m->errorOut(e, "SplitMatrix", "splitDistance");
83 /***********************************************************************/
// Groups sequences by taxonomy and hands the grouping to the file-splitting step.
// Builds a PhyloTree from taxFile, then assigns one group index to the sequences of every
// node whose level equals `cutoff` and which holds more than one sequence.
// The resulting name -> group map goes to splitDistanceFileByTax() for method "classify",
// otherwise to createDistanceFilesFromTax().
// NOTE(review): the declaration and increment of numGroups, and the return value, are not
// visible in this listing.
84 int SplitMatrix::splitClassify(){
// seqGroup: sequence name -> index of the group it was assigned to.
88 map<string, int> seqGroup;
89 map<string, int>::iterator it;
90 map<string, int>::iterator it2;
94 //build tree from users taxonomy file
// NOTE(review): raw owning pointer; no matching delete is visible in this listing -- confirm it is released.
95 PhyloTree* phylo = new PhyloTree();
// temp: filled by readTax from the taxonomy file; used below as name -> taxonomy string.
97 map<string, string> temp;
98 m->readTax(taxFile, temp);
// Move every entry into the tree, erasing as we go so the map shrinks while the tree grows.
// The post-increment inside erase() advances the iterator before the erased element is invalidated.
100 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
101 phylo->addSeqToTree(itTemp->first, itTemp->second);
102 temp.erase(itTemp++);
105 phylo->assignHeirarchyIDs(0);
107 //make sure the cutoff is not greater than maxlevel
108 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
110 //for each node in tree
111 for (int i = 0; i < phylo->getNumNodes(); i++) {
113 //is this node within the cutoff
114 TaxNode taxon = phylo->get(i);
116 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118 for (int j = 0; j < taxon.accessions.size(); j++) {
119 seqGroup[taxon.accessions[j]] = numGroups;
// "classify": split an existing distance file by group.
// Otherwise (presumably "fasta"): compute per-group distance files from the fasta file.
128 if (method == "classify") {
129 splitDistanceFileByTax(seqGroup, numGroups);
131 createDistanceFilesFromTax(seqGroup, numGroups);
137 catch(exception& e) {
138 m->errorOut(e, "SplitMatrix", "splitClassify");
142 /***********************************************************************/
// Creates one distance file per taxonomy group, starting from the fasta file.
//   seqGroup  - sequence name -> group index (sequences absent from the map are singletons)
//   numGroups - number of groups in seqGroup
// Steps visible here:
//   1. write each grouped sequence to <fastafile>.<group>.temp
//   2. build a DistanceCommand option string for each temp fasta file
//   3. collect the expected per-group distance file names and pass them to splitNames()
// NOTE(review): the return value is not visible in this listing.
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
// Working copy: names are erased as they are seen in the fasta file, so whatever is left
// afterwards was grouped by taxonomy but never found in the fasta file.
145 map<string, int> copyGroups = seqGroup;
146 map<string, int>::iterator it;
149 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
154 m->openInputFile(fastafile, in);
// Read one sequence at a time and route it to its group's temp fasta file.
159 Sequence query(in); m->gobble(in);
160 if (query.getName() != "") {
162 it = seqGroup.find(query.getName());
164 //save names in case no namefile is given
165 if ((namefile == "") && (countfile == "")) { names.insert(query.getName()); }
167 if (it != seqGroup.end()) { //not singleton
// The temp file is opened in append mode for each sequence written.
168 m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169 query.printSequence(outFile);
172 copyGroups.erase(query.getName());
178 //warn about sequence in groups that are not in fasta file
179 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
// NOTE(review): the message text contains a typo ("indicate and error"); it is a runtime string and is left unchanged here.
180 m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
186 //process each distance file
187 for (int i = 0; i < numGroups; i++) {
// classic requests "output=lt" and passes no cutoff; otherwise distCutoff is passed as the cutoff.
190 if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191 else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192 if (outputDir != "") { options += ", outputdir=" + outputDir; }
194 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
// NOTE(review): raw owning pointer; how the command is executed and released is not visible in this listing.
196 Command* command = new DistanceCommand(options);
198 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
// The temp fasta file for this group is no longer needed.
203 m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
205 //remove old names files just in case
206 if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
207 else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
210 //restore old fasta file name since dist.seqs overwrites it with the temp files
211 m->setFastaFile(fastafile);
// Rebuild the per-group distance file names: outputDir, then the root of the temp fasta name,
// then "phylip.dist" (classic) or "dist".
213 vector<string> tempDistFiles;
214 for(int i=0;i<numGroups;i++){
// Falls back to the fasta file's directory when no output directory was given.
// Note that this assigns the member, so the fallback persists after the first iteration.
215 if (outputDir == "") { outputDir = m->hasPath(fastafile); }
216 string tempDistFile = "";
217 if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
218 else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
219 tempDistFiles.push_back(tempDistFile);
// Split the name/count file to match the groups and register the (dist file, name file) pairs in `dists`.
222 splitNames(seqGroup, numGroups, tempDistFiles);
// On user interrupt, delete every registered dist/name file pair and forget them.
224 if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
228 catch(exception& e) {
229 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
233 /***********************************************************************/
// Splits an existing distance file into one temp file per taxonomy group.
//   seqGroup  - sequence name -> group index (sequences absent from the map are singletons)
//   numGroups - number of groups in seqGroup
// Each record is read as three fields (seqA, seqB, dist). A record is kept only when both
// names are in seqGroup with the same group index; it then goes to <distFile>.<group>.temp.
// Output is buffered per group to reduce file opens.
// NOTE(review): the read loop header and the return value are not visible in this listing.
234 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
236 map<string, int>::iterator it;
237 map<string, int>::iterator it2;
241 m->openInputFile(distFile, dFile);
244 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
245 m->mothurRemove((distFile + "." + toString(i) + ".temp"));
248 //for buffering the io to improve speed
249 //a group's buffered distances are written out once more than 30 have accumulated (see the numOutputs check below).
// outputs[g]    - pending text for group g
// numOutputs[g] - number of records currently pending for group g
250 vector<string> outputs; outputs.resize(numGroups, "");
251 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
253 //you can have a group made, but there may be no distances in the file for this group if the taxonomy file and distance file don't match
254 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
255 vector<bool> validDistances; validDistances.resize(numGroups, false);
// On user interrupt, close the input and remove every temp distance file.
262 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } }
264 dFile >> seqA >> seqB >> dist; m->gobble(dFile);
266 //if both sequences are in the same group then they are within the cutoff
267 it = seqGroup.find(seqA);
268 it2 = seqGroup.find(seqB);
270 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
271 if (it->second == it2->second) { //they are from the same group so add the distance
// Buffer is full: append the pending text plus the current record to the group's temp file, then reset the buffer.
272 if (numOutputs[it->second] > 30) {
273 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
274 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
276 outputs[it->second] = "";
277 numOutputs[it->second] = 0;
278 validDistances[it->second] = true;
// Presumably the else branch (its header is not visible): keep buffering.
280 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
281 numOutputs[it->second]++;
// The file to split by names is the count file when one was given, otherwise the name file.
288 string inputFile = namefile;
289 if (countfile != "") { inputFile = countfile; }
291 vector<string> tempDistFiles;
292 for (int i = 0; i < numGroups; i++) { //collect temp dist file names, remove old temp name/count files, flush leftover buffers
293 string tempDistFile = distFile + "." + toString(i) + ".temp";
294 tempDistFiles.push_back(tempDistFile);
295 m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
297 //write out any remaining buffers
298 if (numOutputs[i] > 0) {
299 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
300 outFile << outputs[i];
304 validDistances[i] = true;
// Split the name/count file to match the groups and register the (dist file, name file) pairs in `dists`.
308 splitNames(seqGroup, numGroups, tempDistFiles);
// On user interrupt, delete every registered dist/name file pair.
310 if (m->control_pressed) {
311 for (int i = 0; i < dists.size(); i++) {
312 m->mothurRemove((dists[i].begin()->first));
313 m->mothurRemove((dists[i].begin()->second));
320 catch(exception& e) {
321 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
325 /***********************************************************************/
// Splits the distance file into groups of sequences linked by the distances it contains,
// flushing per-group output to temp files as buffers fill.
// For each (seqA, seqB, dist) record: if either name is already in a group, the other is
// added to it; if both are already in different groups, the two groups are merged, along with
// their buffered text and any temp file already written; if neither is known, a new group
// is started.
// Finally the group membership is converted to a name -> group map and passed to splitNames().
// NOTE(review): many lines are missing from this listing, including the read-loop header, the
// cutoff test, the declarations and updates of numGroups, groupID, groupIDA and groupIDB,
// several else headers, and the return value. The comments below describe only what is visible.
326 int SplitMatrix::splitDistanceLarge(){
// groups[g]: names of the sequences currently assigned to group g (emptied when g is merged away).
328 vector<set<string> > groups;
330 //for buffering the io to improve speed
331 //a group's buffered distances are written out once more than 60 have accumulated (see the numOutputs checks below).
// outputs[g]     - pending text for group g
// numOutputs[g]  - number of records currently pending for group g
// wroteOutPut[g] - true once group g has had data written to its temp file
332 vector<string> outputs;
333 vector<int> numOutputs;
334 vector<bool> wroteOutPut;
340 m->openInputFile(distFile, dFile);
346 dFile >> seqA >> seqB >> dist;
// On user interrupt, close the input, remove the temp file of every non-empty group and stop.
348 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
351 //cout << "in cutoff: " << dist << endl;
// Scan the existing groups for either sequence of this pair.
356 for(int i=0;i<numGroups;i++){
357 set<string>::iterator aIt = groups[i].find(seqA);
358 set<string>::iterator bIt = groups[i].find(seqB);
360 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
361 groups[i].insert(seqB);
365 //cout << "in aIt: " << groupID << endl;
368 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
369 groups[i].insert(seqA);
373 // cout << "in bIt: " << groupID << endl;
// Both sequences were found in groups: merge membership into the lower-numbered group
// and empty the other one.
377 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
378 if(groupIDA < groupIDB){
379 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
380 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
381 groups[groupIDB].clear();
385 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
386 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
387 groups[groupIDA].clear();
394 //windows is gonna gag on the reuse of outFile, will need to make it local...
// Neither sequence is in a group yet: start a new group and give it its own buffer slots.
396 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
397 set<string> newGroup;
398 newGroup.insert(seqA);
399 newGroup.insert(seqB);
400 groups.push_back(newGroup);
402 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
403 outputs.push_back(tempOut);
404 numOutputs.push_back(1);
405 wroteOutPut.push_back(false);
// Presumably the else branch (existing group): buffer the record, or flush if the buffer is full.
410 string fileName = distFile + "." + toString(groupID) + ".temp";
412 //have we reached the max buffer size
413 if (numOutputs[groupID] > 60) { //write out sequence
415 outFile.open(fileName.c_str(), ios::app);
416 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
419 outputs[groupID] = "";
420 numOutputs[groupID] = 0;
421 wroteOutPut[groupID] = true;
423 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
424 numOutputs[groupID]++;
427 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
428 string row, column, distance;
429 if(groupIDA<groupIDB){
// Fold group B's pending records into the surviving group's buffer, then reset B's buffer.
432 numOutputs[groupID] += numOutputs[groupIDB];
433 outputs[groupID] += outputs[groupIDB];
435 outputs[groupIDB] = "";
436 numOutputs[groupIDB] = 0;
438 //if groupB is written to file it is above buffer size so read and write to new merged file
439 if (wroteOutPut[groupIDB]) {
440 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
441 /*ifstream fileB(fileName2.c_str(), ios::ate);
443 outFile.open(fileName.c_str(), ios::app);
448 size = fileB.tellg();
450 fileB.seekg (0, ios::beg);
452 int numRead = size / 1024;
453 int lastRead = size % 1024;
455 for (int i = 0; i < numRead; i++) {
457 memblock = new char [1024];
459 fileB.read (memblock, 1024);
461 string temp = memblock;
462 outFile << temp.substr(0, 1024);
467 memblock = new char [lastRead];
469 fileB.read (memblock, lastRead);
471 //not sure why but it will read more than lastRead char...??
472 string temp = memblock;
473 outFile << temp.substr(0, lastRead);
// Group B's temp file is combined with the surviving group's file and then deleted.
// Presumably appendFiles(src, dst) appends src onto dst -- confirm against the helper.
477 m->appendFiles(fileName2, fileName);
478 m->mothurRemove(fileName2);
481 //write out the merged memory
// NOTE(review): this uses openOutputFile, not the append form used elsewhere in this function.
// If that helper truncates, distances already in fileName would be lost -- verify.
482 if (numOutputs[groupID] > 60) {
484 m->openOutputFile(fileName, tempOut);
485 tempOut << outputs[groupID];
486 outputs[groupID] = "";
487 numOutputs[groupID] = 0;
493 wroteOutPut[groupID] = true;
494 wroteOutPut[groupIDB] = false;
495 }else{ } //just merge b's memory with a's memory
// Mirror case: group A is the one being folded into the surviving group.
498 numOutputs[groupID] += numOutputs[groupIDA];
499 outputs[groupID] += outputs[groupIDA];
501 outputs[groupIDA] = "";
502 numOutputs[groupIDA] = 0;
504 if (wroteOutPut[groupIDA]) {
505 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
506 /*ifstream fileB(fileName2.c_str(), ios::ate);
508 outFile.open(fileName.c_str(), ios::app);
513 size = fileB.tellg();
515 fileB.seekg (0, ios::beg);
517 int numRead = size / 1024;
518 int lastRead = size % 1024;
520 for (int i = 0; i < numRead; i++) {
522 memblock = new char [1024];
524 fileB.read (memblock, 1024);
525 string temp = memblock;
526 outFile << temp.substr(0, 1024);
531 memblock = new char [lastRead];
533 fileB.read (memblock, lastRead);
535 //not sure why but it will read more than lastRead char...??
536 string temp = memblock;
537 outFile << temp.substr(0, lastRead);
// Group A's temp file is combined with the surviving group's file and then deleted.
542 m->appendFiles(fileName2, fileName);
543 m->mothurRemove(fileName2);
545 //write out the merged memory
// NOTE(review): same openOutputFile-versus-append concern as in the branch above -- verify.
546 if (numOutputs[groupID] > 60) {
548 m->openOutputFile(fileName, tempOut);
549 tempOut << outputs[groupID];
550 outputs[groupID] = "";
551 numOutputs[groupID] = 0;
557 wroteOutPut[groupID] = true;
558 wroteOutPut[groupIDA] = false;
559 }else { } //just merge memory
// After the read loop: record every group's temp file name and flush whatever is still buffered.
568 vector<string> tempDistFiles;
569 for (int i = 0; i < numGroups; i++) {
570 string fileName = distFile + "." + toString(i) + ".temp";
571 tempDistFiles.push_back(fileName);
572 //remove old names files just in case
574 if (numOutputs[i] > 0) {
576 outFile.open(fileName.c_str(), ios::app);
577 outFile << outputs[i];
// Convert group membership into name -> group index, erasing from each set as we go.
// The post-increment inside erase() advances the iterator before the erased element is invalidated.
582 map<string, int> seqGroup;
583 for (int i = 0; i < groups.size(); i++) {
584 for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
585 seqGroup[*itNames] = i;
586 groups[i].erase(itNames++);
// Split the name/count file to match the groups and register the (dist file, name file) pairs in `dists`.
590 splitNames(seqGroup, numGroups, tempDistFiles);
594 catch(exception& e) {
595 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
599 //********************************************************************************************************************
// Splits the name file (or the count file, when one was given) to match the distance groups.
//   seqGroup      - sequence name -> group index
//   numGroups     - number of groups
//   tempDistFiles - per-group distance file names, indexed by group
// Entries whose name is in seqGroup go to <inputFile>.<group>.temp; everything else goes to the
// "extra" file named by the `singleton` member. For each group whose distance file is not
// empty, the (distance file, name/count file) pair is appended to `dists`. Entries of groups
// with no distances are moved to the extra file.
// NOTE(review): several else headers, the updates of wroteExtra, and the return value are not
// visible in this listing.
600 int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
603 map<string, int>::iterator it;
605 string inputFile = namefile;
606 if (countfile != "") { inputFile = countfile; }
// Remove stale per-group temp files left over from a previous run.
608 for(int i=0;i<numGroups;i++){ m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
// `singleton` names the file that collects entries not assigned to any usable group.
610 singleton = inputFile + ".extra.temp";
611 ofstream remainingNames;
612 m->openOutputFile(singleton, remainingNames);
614 bool wroteExtra = false;
616 ifstream bigNameFile;
617 m->openInputFile(inputFile, bigNameFile);
// A count file begins with a header line; read it here so the data loop starts at the first record.
621 if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
// Each record is read as two fields; the remainder of the line is discarded below.
623 string name, nameList;
624 while(!bigNameFile.eof()){
625 bigNameFile >> name >> nameList;
626 m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
628 //did this sequence get assigned a group
629 it = seqGroup.find(name);
631 if (it != seqGroup.end()) {
632 m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
633 outFile << name << '\t' << nameList << endl;
// Presumably the else branch (its header is not visible): unassigned entries go to the extra file.
637 remainingNames << name << '\t' << nameList << endl;
// Pair each group's distance file with its name/count file, skipping groups without distances.
642 for(int i=0;i<numGroups;i++){
643 string tempNameFile = inputFile + "." + toString(i) + ".temp";
644 string tempDistFile = tempDistFiles[i];
646 //if there are valid distances
648 fileHandle.open(tempDistFile.c_str());
650 m->gobble(fileHandle);
651 if (!fileHandle.eof()) { //check
652 map<string, string> temp;
// For count files, rebuild the temp file with a header line in front: write the header to a new
// file, append the old contents, then rename the new file over the old one.
653 if (countfile != "") {
656 string newtempNameFile = tempNameFile + "2";
657 m->openOutputFile(newtempNameFile, out);
658 out << "Representative_Sequence\ttotal" << endl;
660 m->appendFiles(tempNameFile, newtempNameFile);
661 m->mothurRemove(tempNameFile);
662 m->renameFile(newtempNameFile, tempNameFile);
664 temp[tempDistFile] = tempNameFile;
665 dists.push_back(temp);
// Presumably the else branch (no distances for this group): move its entries to the extra file
// and delete the group's temp name/count file.
668 m->openInputFile(tempNameFile, in);
671 in >> name >> nameList; m->gobble(in);
673 remainingNames << name << '\t' << nameList << endl;
676 m->mothurRemove(tempNameFile);
682 remainingNames.close();
// NOTE(review): the condition guarding this removal is not visible; presumably the extra file is
// deleted when nothing was written to it (see wroteExtra) -- confirm.
685 m->mothurRemove(singleton);
// Otherwise, for count files, add the header line to the extra file using the same
// write-header, append, rename sequence as above.
687 }else if (countfile != "") {
690 string newtempNameFile = singleton + "2";
691 m->openOutputFile(newtempNameFile, out);
692 out << "Representative_Sequence\ttotal" << endl;
694 m->appendFiles(singleton, newtempNameFile);
695 m->mothurRemove(singleton);
696 m->renameFile(newtempNameFile, singleton);
701 catch(exception& e) {
702 m->errorOut(e, "SplitMatrix", "splitNames");
706 //********************************************************************************************************************
// In-memory variant of the distance-based split.
// It uses the same grouping rules as splitDistanceLarge(): extend a group when one sequence of
// a pair is already in it, merge two groups when both are known, start a new group when neither is.
// All per-group output text stays in `outputs` and is written to the temp files once, at the end.
// NOTE(review): many lines are missing from this listing, including the read-loop header, the
// cutoff test, the declarations and updates of numGroups, groupID, groupIDA and groupIDB,
// several else headers, and the return value. The comments below describe only what is visible.
707 int SplitMatrix::splitDistanceRAM(){
// groups[g]:  names of the sequences currently assigned to group g
// outputs[g]: all distance records collected for group g, as text
709 vector<set<string> > groups;
710 vector<string> outputs;
715 m->openInputFile(distFile, dFile);
721 dFile >> seqA >> seqB >> dist;
// On user interrupt, close the input, remove the temp file of every non-empty group and stop.
723 if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; }
726 //cout << "in cutoff: " << dist << endl;
// Scan the existing groups for either sequence of this pair.
731 for(int i=0;i<numGroups;i++){
732 set<string>::iterator aIt = groups[i].find(seqA);
733 set<string>::iterator bIt = groups[i].find(seqB);
735 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
736 groups[i].insert(seqB);
740 //cout << "in aIt: " << groupID << endl;
743 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
744 groups[i].insert(seqA);
748 // cout << "in bIt: " << groupID << endl;
// Both sequences were found in groups: merge membership into the lower-numbered group
// and empty the other one.
752 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
753 if(groupIDA < groupIDB){
754 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
755 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
756 groups[groupIDB].clear();
760 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
761 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
762 groups[groupIDA].clear();
769 //windows is gonna gag on the reuse of outFile, will need to make it local...
// Neither sequence is in a group yet: start a new group with its own output text.
771 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
772 set<string> newGroup;
773 newGroup.insert(seqA);
774 newGroup.insert(seqB);
775 groups.push_back(newGroup);
777 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
778 outputs.push_back(tempOut);
// Presumably the else branch (existing group): add the record to that group's text.
783 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
785 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
786 string row, column, distance;
// Move the absorbed group's text into the surviving group and clear the absorbed slot.
787 if(groupIDA<groupIDB){
789 outputs[groupID] += outputs[groupIDB];
790 outputs[groupIDB] = "";
792 outputs[groupID] += outputs[groupIDA];
793 outputs[groupIDA] = "";
// After the read loop: record every group's temp file name and write each non-empty group's text.
802 vector<string> tempDistFiles;
803 for (int i = 0; i < numGroups; i++) {
804 string fileName = distFile + "." + toString(i) + ".temp";
805 tempDistFiles.push_back(fileName);
806 if (outputs[i] != "") {
// NOTE(review): the file is opened with ios::ate rather than ios::app. For an output file
// stream, ate without app or in does not preserve existing contents -- confirm that is intended.
808 outFile.open(fileName.c_str(), ios::ate);
809 outFile << outputs[i];
// Convert group membership into name -> group index, erasing from each set as we go.
// The post-increment inside erase() advances the iterator before the erased element is invalidated.
814 map<string, int> seqGroup;
815 for (int i = 0; i < groups.size(); i++) {
816 for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
817 seqGroup[*itNames] = i;
818 groups[i].erase(itNames++);
// Split the name/count file to match the groups and register the (dist file, name file) pairs in `dists`.
822 splitNames(seqGroup, numGroups, tempDistFiles);
826 catch(exception& e) {
827 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
831 //********************************************************************************************************************
// Comparator used by getDistanceFiles() to order `dists` entries.
// Compares the on-disk sizes of the files named by the first key of each map and returns true
// when the left file is larger.
// NOTE(review): both maps are taken by value, so each comparison copies them; a const
// reference would avoid that. Each map is assumed to be non-empty (begin() is dereferenced).
// NOTE(review): the declarations of pFile, pFile2, leftsize and rightsize, and the handling
// after a failed fopen, are not visible in this listing.
832 //sorts biggest to smallest
833 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
838 //get num bytes in file
839 string filename = left.begin()->first;
840 pFile = fopen (filename.c_str(),"rb");
841 string error = "Error opening " + filename;
// A failed open is reported with perror.
842 if (pFile==NULL) perror (error.c_str());
// Seek to the end and read the offset to get the size in bytes.
844 fseek (pFile, 0, SEEK_END);
845 leftsize=ftell (pFile);
852 //get num bytes in file
853 filename = right.begin()->first;
854 pFile2 = fopen (filename.c_str(),"rb");
855 error = "Error opening " + filename;
856 if (pFile2==NULL) perror (error.c_str());
858 fseek (pFile2, 0, SEEK_END);
859 rightsize=ftell (pFile2);
// Strictly greater-than, so equally sized files compare as equivalent.
863 return (leftsize > rightsize);
865 /***********************************************************************/
866 //returns map of distance files -> namefile sorted by distance file size
// Each element of the result is a map from a distance file name to its name/count file name.
// `dists` is sorted in place with compareFileSizes, which puts larger distance files first.
// NOTE(review): the return statement is not visible in this listing; presumably `dists` is returned.
867 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
870 sort(dists.begin(), dists.end(), compareFileSizes);
874 catch(exception& e) {
875 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
879 /***********************************************************************/
// Destructor: empty body, so it performs no explicit cleanup of its own.
880 SplitMatrix::~SplitMatrix(){}
881 /***********************************************************************/