git.donarmstrong.com Git - mothur.git/blobdiff - splitmatrix.cpp
made make.table alias to count.seqs command. added large parameter to count.seqs...
[mothur.git] / splitmatrix.cpp
index 0929cb09167aebb2362bc6f29bf01c710d79ec8d..384b09af1bb94be09c5607c8b863b28b16215731 100644 (file)
@@ -9,6 +9,8 @@
 
 #include "splitmatrix.h"
 #include "phylotree.h"
+#include "distancecommand.h"
+#include "seqsummarycommand.h"
 
 /***********************************************************************/
 
@@ -21,6 +23,20 @@ SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, stri
        taxFile = tax;
        large = l;
 }
+/***********************************************************************/
+
+SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){ //constructor taking a fasta file instead of a distance file; it only stores its arguments, no files are touched here
+       m = MothurOut::getInstance(); //MothurOut instance used throughout this class for messages and file helpers
+       fastafile = ffile; //fasta file that createDistanceFilesFromTax() parses into one temp file per group
+       namefile = name; //names file that gets split per group; createDistanceFilesFromTax() checks it against ""
+       taxFile = tax; //taxonomy file read by splitClassify() to build the groups
+       cutoff = c;  //tax level cutoff (splitClassify() truncates it to an int before use)
+       distCutoff = cu; //for the fasta method: cutoff handed to DistanceCommand when the distance matrices are created (not passed when classic is true)
+       method = t; //split() dispatches on this value: "distance", "classify" or "fasta"
+       processors = p; //forwarded as processors= in the DistanceCommand options
+    classic = cl; //true => DistanceCommand is run with output=lt and the result is looked up as *phylip.dist instead of *dist
+       outputDir = output; //"" => createDistanceFilesFromTax() falls back to the path of fastafile
+} //NOTE(review): unlike the other constructor, large is not assigned here - confirm it is never read when this overload is used
 
 /***********************************************************************/
 
@@ -29,7 +45,7 @@ int SplitMatrix::split(){
         
                if (method == "distance") {  
                        splitDistance();
-               }else if (method == "classify") {
+               }else if ((method == "classify") || (method == "fasta")) {
                        splitClassify();
                }else {
                        m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
@@ -51,6 +67,8 @@ int SplitMatrix::splitDistance(){
         
                if (large)      { splitDistanceLarge(); }
                else            { splitDistanceRAM();   }
+               
+               return 0;
                        
        }
        catch(exception& e) {
@@ -63,7 +81,7 @@ int SplitMatrix::splitDistance(){
 int SplitMatrix::splitClassify(){
        try {
                cutoff = int(cutoff);
-               
+                               
                map<string, int> seqGroup;
                map<string, int>::iterator it;
                map<string, int>::iterator it2;
@@ -73,29 +91,25 @@ int SplitMatrix::splitClassify(){
                //build tree from users taxonomy file
                PhyloTree* phylo = new PhyloTree();
                
-               ifstream in;
-               openInputFile(taxFile, in);
-                       
-               //read in users taxonomy file and add sequences to tree
-               string seqname, tax;
-               while(!in.eof()){
-                       in >> seqname >> tax; gobble(in);
-                               
-                       phylo->addSeqToTree(seqname, tax);
-               }
-               in.close();
+        map<string, string> temp;
+        m->readTax(taxFile, temp);
+        
+        for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
+            phylo->addSeqToTree(itTemp->first, itTemp->second);
+            temp.erase(itTemp++);
+        }
                
                phylo->assignHeirarchyIDs(0);
 
                //make sure the cutoff is not greater than maxlevel
                if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
-               
+       
                //for each node in tree
                for (int i = 0; i < phylo->getNumNodes(); i++) {
                
                        //is this node within the cutoff
                        TaxNode taxon = phylo->get(i);
-               
+       
                        if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
                                if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
                                        for (int j = 0; j < taxon.accessions.size(); j++) {
@@ -105,29 +119,191 @@ int SplitMatrix::splitClassify(){
                                }
                        }
                }
+       
+               delete phylo;
+               
+               if (method == "classify") {
+                       splitDistanceFileByTax(seqGroup, numGroups);
+               }else {
+                       createDistanceFilesFromTax(seqGroup, numGroups);
+               }
+               
+               return 0;
+                       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SplitMatrix", "splitClassify");
+               exit(1);
+       }
+}
+/***********************************************************************/
+int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){ //splits fastafile into one temp fasta per group, runs DistanceCommand on each, splits namefile to match, and records each (distance file -> name file) pair in dists; always returns 0
+       try { //seqGroup maps sequence name -> group number; group numbers are presumably in [0, numGroups) since only those temp files are processed below - TODO confirm with caller
+               map<string, int> copyGroups = seqGroup; //working copy: entries are erased as sequences are seen in the fasta file, leftovers are treated as missing
+               map<string, int>::iterator it;
+               set<string> names; //NOTE(review): only filled when namefile == "" and never read again in this function
+                               
+               for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+                       m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
+               }
+                       
+               ifstream in;
+               m->openInputFile(fastafile, in);
+       
+               //parse fastafile
+               ofstream outFile; //reused for every append below (group fasta files first, group name files later)
+               while (!in.eof()) {
+                       Sequence query(in); m->gobble(in);
+                       if (query.getName() != "") {
+               
+                               it = seqGroup.find(query.getName());
+                               
+                               //save names in case no namefile is given
+                               if (namefile == "") {  names.insert(query.getName()); }
+                       
+                               if (it != seqGroup.end()) { //not singleton: append the sequence to its group's temp fasta file
+                                       m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); //the file is reopened in append mode and closed again for every sequence
+                                       query.printSequence(outFile); 
+                                       outFile.close();
+                                       
+                                       copyGroups.erase(query.getName()); //seen in the fasta file, so it is not missing
+                               }
+                       }
+               }
+               in.close();
+               
+               //anything left in copyGroups was grouped by taxonomy but never seen in the fasta file; only the first such name is reported because the loop body exits the process
+               for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
+                       m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
+                       exit(1);
+               }
+               
+               copyGroups.clear();
+               
+               //create a distance file from each group's temp fasta file
+               for (int i = 0; i < numGroups; i++) { 
+                       
+                       string options = "";
+            if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; } //classic: output=lt is requested and no cutoff is passed
+            else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); } //otherwise distCutoff is passed as cutoff=
+                       if (outputDir != "") { options += ", outputdir=" + outputDir; }
+                       
+                       Command* command = new DistanceCommand(options);
+                       
+                       command->execute(); //return value is not checked
+                       delete command;
+                       
+                       m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //the group's temp fasta file is deleted once the command has run
+                       
+                       //remove old names files just in case
+                       m->mothurRemove((namefile + "." + toString(i) + ".temp"));
+               }
+                       
+               singleton = namefile + ".extra.temp"; //collects the names that do not end up in a group with distances
+               ofstream remainingNames;
+               m->openOutputFile(singleton, remainingNames);
+               
+               bool wroteExtra = false; //set as soon as anything is written to remainingNames
 
+               ifstream bigNameFile;
+               m->openInputFile(namefile, bigNameFile); //NOTE(review): opened even when namefile == "" - confirm callers always supply a names file for this path
+               
+               string name, nameList;
+               while(!bigNameFile.eof()){
+                       bigNameFile >> name >> nameList;  m->gobble(bigNameFile); //each record is read as two whitespace-separated fields
+                       
+                       //did this sequence get assigned a group
+                       it = seqGroup.find(name);
+                       
+                       if (it != seqGroup.end()) {  
+                               m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); //yes: append the record to that group's temp name file
+                               outFile << name << '\t' << nameList << endl;
+                               outFile.close();
+                       }else{
+                               wroteExtra = true; //no: the record goes to the singleton file
+                               remainingNames << name << '\t' << nameList << endl;
+                       }
+               }
+               bigNameFile.close();
+               
+               for(int i=0;i<numGroups;i++){ //pair each group's name file with its distance file, or fold the group into the singleton file
+                       string tempNameFile = namefile + "." + toString(i) + ".temp";
+                       if (outputDir == "") { outputDir = m->hasPath(fastafile); } //default to the fasta file's path; note this overwrites the member, not a local
+                       string tempDistFile = "";
+            if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";} //file name DistanceCommand is expected to have produced - TODO confirm against DistanceCommand
+            else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
+
+                       //if there are valid distances
+                       ifstream fileHandle;
+                       fileHandle.open(tempDistFile.c_str());
+                       if(fileHandle)  {       
+                               m->gobble(fileHandle);
+                               if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
+                                       map<string, string> temp; //single-entry map: distance file -> matching name file
+                                       temp[tempDistFile] = tempNameFile;
+                                       dists.push_back(temp);
+                               }else { //blank distance file: move this group's names to the singleton file and delete its name file (the blank distance file itself is left in place)
+                                       ifstream in;
+                                       m->openInputFile(tempNameFile, in);
+                               
+                                       while(!in.eof()) { 
+                                               in >> name >> nameList;  m->gobble(in);
+                                               wroteExtra = true;
+                                               remainingNames << name << '\t' << nameList << endl;
+                                       }
+                                       in.close();
+                                       m->mothurRemove(tempNameFile);
+                               }
+                       } //NOTE(review): if the distance file could not be opened, the group is neither added to dists nor moved to the singleton file
+                       fileHandle.close();
+               }
+               
+               remainingNames.close();
+               if (!wroteExtra) { 
+                       m->mothurRemove(singleton); //nothing was written, so delete the empty file
+                       singleton = "none"; //marks that no singleton file exists
+               }
+
+               if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } //if control_pressed is set, delete every file recorded in dists and forget them
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax"); //hand the exception to errorOut, then terminate the process
+               exit(1);
+       }
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
+       try {
+               map<string, int>::iterator it;
+               map<string, int>::iterator it2;
+               
                ifstream dFile;
-               openInputFile(distFile, dFile);
+               m->openInputFile(distFile, dFile);
                ofstream outFile;
                
                for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-                       remove((distFile + "." + toString(i) + ".temp").c_str());
+                       m->mothurRemove((distFile + "." + toString(i) + ".temp"));
                }
                
-               
                //for buffering the io to improve speed
                 //allow for 10 dists to be stored, then output.
                vector<string> outputs;  outputs.resize(numGroups, "");
                vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
                
+               //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
+               //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
+               vector<bool> validDistances;   validDistances.resize(numGroups, false); 
+               
                //for each distance
                while(dFile){
                        string seqA, seqB;
                        float dist;
                        
-                       if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
+                       if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
                        
-                       dFile >> seqA >> seqB >> dist;  gobble(dFile);
+                       dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
                        
                        //if both sequences are in the same group then they are within the cutoff
                        it = seqGroup.find(seqA);
@@ -135,12 +311,13 @@ int SplitMatrix::splitClassify(){
                        
                        if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
                                if (it->second == it2->second) { //they are from the same group so add the distance
-                                       if (numOutputs[it->second] > 10) {
-                                               openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+                                       if (numOutputs[it->second] > 30) {
+                                               m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
                                                outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
                                                outFile.close();
                                                outputs[it->second] = "";
                                                numOutputs[it->second] = 0;
+                                               validDistances[it->second] = true;
                                        }else{
                                                outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
                                                numOutputs[it->second]++;
@@ -151,36 +328,37 @@ int SplitMatrix::splitClassify(){
                dFile.close();
        
                for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-                       remove((namefile + "." + toString(i) + ".temp").c_str());
+                       m->mothurRemove((namefile + "." + toString(i) + ".temp"));
                        
                        //write out any remaining buffers
-                       if (numOutputs[it->second] > 0) {
-                               openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+                       if (numOutputs[i] > 0) {
+                               m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
                                outFile << outputs[i];
                                outFile.close();
                                outputs[i] = "";
                                numOutputs[i] = 0;
+                               validDistances[i] = true;
                        }
                }
                
                ifstream bigNameFile;
-               openInputFile(namefile, bigNameFile);
+               m->openInputFile(namefile, bigNameFile);
                
                singleton = namefile + ".extra.temp";
                ofstream remainingNames;
-               openOutputFile(singleton, remainingNames);
+               m->openOutputFile(singleton, remainingNames);
                
                bool wroteExtra = false;
                                                
                string name, nameList;
                while(!bigNameFile.eof()){
-                       bigNameFile >> name >> nameList;  gobble(bigNameFile);
+                       bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
                        
                        //did this sequence get assigned a group
                        it = seqGroup.find(name);
                        
                        if (it != seqGroup.end()) {  
-                               openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+                               m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
                                outFile << name << '\t' << nameList << endl;
                                outFile.close();
                        }else{
@@ -189,35 +367,49 @@ int SplitMatrix::splitClassify(){
                        }
                }
                bigNameFile.close();
-               remainingNames.close();
-               
-               if (!wroteExtra) { 
-                       remove(singleton.c_str());
-                       singleton = "none";
-               }
-                       
+                               
                for(int i=0;i<numGroups;i++){
                        string tempNameFile = namefile + "." + toString(i) + ".temp";
                        string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+                       //if there are valid distances
+                       if (validDistances[i]) {
+                               map<string, string> temp;
+                               temp[tempDistFile] = tempNameFile;
+                               dists.push_back(temp);
+                       }else{
+                               ifstream in;
+                               m->openInputFile(tempNameFile, in);
                                
-                       map<string, string> temp;
-                       temp[tempDistFile] = tempNameFile;
-                       dists.push_back(temp);
+                               while(!in.eof()) { 
+                                       in >> name >> nameList;  m->gobble(in);
+                                       wroteExtra = true;
+                                       remainingNames << name << '\t' << nameList << endl;
+                               }
+                               in.close();
+                               m->mothurRemove(tempNameFile);
+                       }
                }
                
+               remainingNames.close();
+               
+               if (!wroteExtra) { 
+                       m->mothurRemove(singleton);
+                       singleton = "none";
+               }
+
                if (m->control_pressed)  {  
                        for (int i = 0; i < dists.size(); i++) { 
-                               remove((dists[i].begin()->first).c_str());
-                               remove((dists[i].begin()->second).c_str());
+                               m->mothurRemove((dists[i].begin()->first));
+                               m->mothurRemove((dists[i].begin()->second));
                        }
                        dists.clear();
                }
                
                return 0;
-                       
        }
        catch(exception& e) {
-               m->errorOut(e, "SplitMatrix", "splitClassify");
+               m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
                exit(1);
        }
 }
@@ -236,7 +428,7 @@ int SplitMatrix::splitDistanceLarge(){
 
                ofstream outFile;
                ifstream dFile;
-               openInputFile(distFile, dFile);
+               m->openInputFile(distFile, dFile);
        
                while(dFile){
                        string seqA, seqB;
@@ -244,7 +436,7 @@ int SplitMatrix::splitDistanceLarge(){
 
                        dFile >> seqA >> seqB >> dist;
                        
-                       if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
+                       if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
                                        
                        if(dist < cutoff){
                                //cout << "in cutoff: " << dist << endl;
@@ -372,7 +564,7 @@ int SplitMatrix::splitDistanceLarge(){
                                                                delete memblock;
                                                                
                                                                fileB.close();
-                                                               remove(fileName2.c_str());
+                                                               m->mothurRemove(fileName2);
                                                                
                                                                //write out the merged memory
                                                                if (numOutputs[groupID] > 60) {
@@ -432,7 +624,7 @@ int SplitMatrix::splitDistanceLarge(){
                                                                delete memblock;
                                                                
                                                                fileB.close();
-                                                               remove(fileName2.c_str());
+                                                               m->mothurRemove(fileName2);
                                                                
                                                                //write out the merged memory
                                                                if (numOutputs[groupID] > 60) {
@@ -450,7 +642,7 @@ int SplitMatrix::splitDistanceLarge(){
                                        }
                                }
                        }
-                       gobble(dFile);
+                       m->gobble(dFile);
                }
                dFile.close();
                
@@ -488,7 +680,7 @@ int SplitMatrix::splitNames(vector<set<string> >& groups){
                while(bigNameFile){
                        bigNameFile >> name >> nameList;
                        nameMap[name] = nameList;
-                       gobble(bigNameFile);
+                       m->gobble(bigNameFile);
                }
                bigNameFile.close();
                        
@@ -535,8 +727,8 @@ int SplitMatrix::splitNames(vector<set<string> >& groups){
                
                if (m->control_pressed)  {  
                        for (int i = 0; i < dists.size(); i++) { 
-                               remove((dists[i].begin()->first).c_str());
-                               remove((dists[i].begin()->second).c_str());
+                               m->mothurRemove((dists[i].begin()->first));
+                               m->mothurRemove((dists[i].begin()->second));
                        }
                        dists.clear();
                }
@@ -557,7 +749,7 @@ int SplitMatrix::splitDistanceRAM(){
                int numGroups = 0;
 
                ifstream dFile;
-               openInputFile(distFile, dFile);
+               m->openInputFile(distFile, dFile);
 
                while(dFile){
                        string seqA, seqB;
@@ -565,7 +757,7 @@ int SplitMatrix::splitDistanceRAM(){
 
                        dFile >> seqA >> seqB >> dist;
                        
-                       if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
+                       if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
                                        
                        if(dist < cutoff){
                                //cout << "in cutoff: " << dist << endl;
@@ -640,7 +832,7 @@ int SplitMatrix::splitDistanceRAM(){
                                        }
                                }
                        }
-                       gobble(dFile);
+                       m->gobble(dFile);
                }
                dFile.close();